#Predicting Online News Popularity based on Pre-publication Features
#Initial Results and Code
#Link to dataset: https://archive.ics.uci.edu/ml/datasets/Online+News+Popularity
#Link to data: https://archive.ics.uci.edu/ml/machine-learning-databases/00332/OnlineNewsPopularity.zip
#Sections:
#Step 1 - Data Processing
#Step 2 - Exploratory Analysis
#Step 3 - Dimensionality Reduction
#Step 4 - Experimental Design
#Step 5 - Prediction
#Step 6 - Performance Evaluation
#Step 1 - Data Processing
#The "Online News Popularity Data Set" (Fernandes et al., 2015) is retrieved from the UCI Machine Learning Repository,
#and data processing procedures are implemented. At the start of the univariate analysis, the target variable is
#determined to be the number of shares; and articles are classified as popular or unpopular based on the number of
#shares. Various graphs are plotted such as boxplots and histograms to detect outliers, illustrate whether the data is
#normally distributed, and whether the data is balanced. To determine whether there are any variables with low variance,
#a near-zero variance filter is used to filter out the attributes with near-zero variance and help to reduce the
#dimensionality of the dataset. In bivariate analysis, pairwise relations are examined between the input variables as
#well as the output and input variables. Scatterplots provide visualisations to better understand the data; and
#correlation analysis is done to help reduce the dimensionality of the dataset when there is high correlation among the
#independent variables.
#Initial Analysis - Univariate Analysis
#Read the data
#Download the zipped dataset to a temporary file and read the CSV inside it.
#mode = "wb" writes the download as binary, which is required for .zip files
#on Windows (the default text mode corrupts binary archives there).
temp <- tempfile()
download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/00332/OnlineNewsPopularity.zip",
              temp, mode = "wb")
pop <- read.table(unz(temp, "OnlineNewsPopularity/OnlineNewsPopularity.csv"), header = TRUE, sep = ",",
                  stringsAsFactors = FALSE, na.strings = c("", "NA"))
#Delete the temporary file once the data is in memory
unlink(temp)
#View the first few rows of the dataset to inspect column names and example values
head(pop)
## url timedelta
## 1 http://mashable.com/2013/01/07/amazon-instant-video-browser/ 731
## 2 http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/ 731
## 3 http://mashable.com/2013/01/07/apple-40-billion-app-downloads/ 731
## 4 http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/ 731
## 5 http://mashable.com/2013/01/07/att-u-verse-apps/ 731
## 6 http://mashable.com/2013/01/07/beewi-smart-toys/ 731
## n_tokens_title n_tokens_content n_unique_tokens n_non_stop_words
## 1 12 219 0.6635945 1
## 2 9 255 0.6047431 1
## 3 9 211 0.5751295 1
## 4 9 531 0.5037879 1
## 5 13 1072 0.4156456 1
## 6 10 370 0.5598886 1
## n_non_stop_unique_tokens num_hrefs num_self_hrefs num_imgs num_videos
## 1 0.8153846 4 2 1 0
## 2 0.7919463 3 1 1 0
## 3 0.6638655 3 1 1 0
## 4 0.6656347 9 0 1 0
## 5 0.5408895 19 19 20 0
## 6 0.6981982 2 2 0 0
## average_token_length num_keywords data_channel_is_lifestyle
## 1 4.680365 5 0
## 2 4.913725 4 0
## 3 4.393365 6 0
## 4 4.404896 7 0
## 5 4.682836 7 0
## 6 4.359459 9 0
## data_channel_is_entertainment data_channel_is_bus data_channel_is_socmed
## 1 1 0 0
## 2 0 1 0
## 3 0 1 0
## 4 1 0 0
## 5 0 0 0
## 6 0 0 0
## data_channel_is_tech data_channel_is_world kw_min_min kw_max_min kw_avg_min
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 1 0 0 0 0
## 6 1 0 0 0 0
## kw_min_max kw_max_max kw_avg_max kw_min_avg kw_max_avg kw_avg_avg
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## self_reference_min_shares self_reference_max_shares
## 1 496 496
## 2 0 0
## 3 918 918
## 4 0 0
## 5 545 16000
## 6 8500 8500
## self_reference_avg_sharess weekday_is_monday weekday_is_tuesday
## 1 496.000 1 0
## 2 0.000 1 0
## 3 918.000 1 0
## 4 0.000 1 0
## 5 3151.158 1 0
## 6 8500.000 1 0
## weekday_is_wednesday weekday_is_thursday weekday_is_friday
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## weekday_is_saturday weekday_is_sunday is_weekend LDA_00 LDA_01
## 1 0 0 0 0.50033120 0.37827893
## 2 0 0 0 0.79975569 0.05004668
## 3 0 0 0 0.21779229 0.03333446
## 4 0 0 0 0.02857322 0.41929964
## 5 0 0 0 0.02863281 0.02879355
## 6 0 0 0 0.02224528 0.30671758
## LDA_02 LDA_03 LDA_04 global_subjectivity
## 1 0.04000468 0.04126265 0.04012254 0.5216171
## 2 0.05009625 0.05010067 0.05000071 0.3412458
## 3 0.03335142 0.03333354 0.68218829 0.7022222
## 4 0.49465083 0.02890472 0.02857160 0.4298497
## 5 0.02857518 0.02857168 0.88542678 0.5135021
## 6 0.02223128 0.02222429 0.62658158 0.4374086
## global_sentiment_polarity global_rate_positive_words
## 1 0.09256198 0.04566210
## 2 0.14894781 0.04313725
## 3 0.32333333 0.05687204
## 4 0.10070467 0.04143126
## 5 0.28100348 0.07462687
## 6 0.07118419 0.02972973
## global_rate_negative_words rate_positive_words rate_negative_words
## 1 0.013698630 0.7692308 0.2307692
## 2 0.015686275 0.7333333 0.2666667
## 3 0.009478673 0.8571429 0.1428571
## 4 0.020715631 0.6666667 0.3333333
## 5 0.012126866 0.8602151 0.1397849
## 6 0.027027027 0.5238095 0.4761905
## avg_positive_polarity min_positive_polarity max_positive_polarity
## 1 0.3786364 0.10000000 0.7
## 2 0.2869146 0.03333333 0.7
## 3 0.4958333 0.10000000 1.0
## 4 0.3859652 0.13636364 0.8
## 5 0.4111274 0.03333333 1.0
## 6 0.3506100 0.13636364 0.6
## avg_negative_polarity min_negative_polarity max_negative_polarity
## 1 -0.3500000 -0.600 -0.2000000
## 2 -0.1187500 -0.125 -0.1000000
## 3 -0.4666667 -0.800 -0.1333333
## 4 -0.3696970 -0.600 -0.1666667
## 5 -0.2201923 -0.500 -0.0500000
## 6 -0.1950000 -0.400 -0.1000000
## title_subjectivity title_sentiment_polarity abs_title_subjectivity
## 1 0.5000000 -0.1875000 0.00000000
## 2 0.0000000 0.0000000 0.50000000
## 3 0.0000000 0.0000000 0.50000000
## 4 0.0000000 0.0000000 0.50000000
## 5 0.4545455 0.1363636 0.04545455
## 6 0.6428571 0.2142857 0.14285714
## abs_title_sentiment_polarity shares
## 1 0.1875000 593
## 2 0.0000000 711
## 3 0.0000000 1500
## 4 0.0000000 1200
## 5 0.1363636 505
## 6 0.2142857 855
#View the last few rows to confirm the dataset loaded completely
tail(pop)
## url
## 39639 http://mashable.com/2014/12/27/protests-continue-ramos-funeral/
## 39640 http://mashable.com/2014/12/27/samsung-app-autism/
## 39641 http://mashable.com/2014/12/27/seth-rogen-james-franco-will-live-tweet-the-interview/
## 39642 http://mashable.com/2014/12/27/son-pays-off-mortgage/
## 39643 http://mashable.com/2014/12/27/ukraine-blasts/
## 39644 http://mashable.com/2014/12/27/youtube-channels-2015/
## timedelta n_tokens_title n_tokens_content n_unique_tokens
## 39639 8 11 223 0.6531532
## 39640 8 11 346 0.5290520
## 39641 8 12 328 0.6962963
## 39642 8 10 442 0.5163551
## 39643 8 6 682 0.5394933
## 39644 8 10 157 0.7019868
## n_non_stop_words n_non_stop_unique_tokens num_hrefs num_self_hrefs
## 39639 1 0.8257576 5 3
## 39640 1 0.6847826 9 7
## 39641 1 0.8850575 9 7
## 39642 1 0.6441281 24 1
## 39643 1 0.6926605 10 1
## 39644 1 0.8461538 1 1
## num_imgs num_videos average_token_length num_keywords
## 39639 1 0 4.923767 6
## 39640 1 1 4.523121 8
## 39641 3 48 4.405488 7
## 39642 12 1 5.076923 8
## 39643 1 0 4.975073 5
## 39644 0 2 4.471338 4
## data_channel_is_lifestyle data_channel_is_entertainment
## 39639 0 0
## 39640 0 0
## 39641 0 0
## 39642 0 0
## 39643 0 0
## 39644 0 1
## data_channel_is_bus data_channel_is_socmed data_channel_is_tech
## 39639 1 0 0
## 39640 0 0 1
## 39641 0 1 0
## 39642 0 0 0
## 39643 0 0 0
## 39644 0 0 0
## data_channel_is_world kw_min_min kw_max_min kw_avg_min kw_min_max
## 39639 0 -1 459 91.000 0
## 39640 0 -1 671 173.125 26900
## 39641 0 -1 616 184.000 6500
## 39642 0 -1 691 168.250 6200
## 39643 1 -1 0 -1.000 0
## 39644 0 -1 97 23.500 205600
## kw_max_max kw_avg_max kw_min_avg kw_max_avg kw_avg_avg
## 39639 843300 484083.3 0.000 4301.332 2665.713
## 39640 843300 374962.5 2514.743 4004.343 3031.116
## 39641 843300 192985.7 1664.268 5470.169 3411.661
## 39642 843300 295850.0 1753.882 6880.687 4206.439
## 39643 843300 254600.0 0.000 3384.317 1777.896
## 39644 843300 366200.0 3035.081 3613.513 3296.909
## self_reference_min_shares self_reference_max_shares
## 39639 2000 5700
## 39640 11400 48000
## 39641 2100 2100
## 39642 1400 1400
## 39643 452 452
## 39644 2100 2100
## self_reference_avg_sharess weekday_is_monday weekday_is_tuesday
## 39639 3633.333 0 0
## 39640 37033.333 0 0
## 39641 2100.000 0 0
## 39642 1400.000 0 0
## 39643 452.000 0 0
## 39644 2100.000 0 0
## weekday_is_wednesday weekday_is_thursday weekday_is_friday
## 39639 1 0 0
## 39640 1 0 0
## 39641 1 0 0
## 39642 1 0 0
## 39643 1 0 0
## 39644 1 0 0
## weekday_is_saturday weekday_is_sunday is_weekend LDA_00 LDA_01
## 39639 0 0 0 0.55133808 0.03333719
## 39640 0 0 0 0.02503777 0.02500062
## 39641 0 0 0 0.02934870 0.02857493
## 39642 0 0 0 0.15900446 0.02502466
## 39643 0 0 0 0.04000361 0.04000349
## 39644 0 0 0 0.05000126 0.79933895
## LDA_02 LDA_03 LDA_04 global_subjectivity
## 39639 0.03334721 0.03333504 0.34864248 0.5520408
## 39640 0.15170116 0.02500011 0.77326035 0.4826786
## 39641 0.23186607 0.68163487 0.02857542 0.5643743
## 39642 0.02520734 0.64379353 0.14697000 0.5102958
## 39643 0.83998726 0.04000210 0.04000355 0.3585776
## 39644 0.05000041 0.05065874 0.05000064 0.5178932
## global_sentiment_polarity global_rate_positive_words
## 39639 0.268877551 0.03139013
## 39640 0.141964286 0.03757225
## 39641 0.194249311 0.03963415
## 39642 0.024608586 0.03393665
## 39643 -0.008065863 0.02052786
## 39644 0.104891775 0.06369427
## global_rate_negative_words rate_positive_words rate_negative_words
## 39639 0.004484305 0.8750000 0.1250000
## 39640 0.014450867 0.7222222 0.2777778
## 39641 0.009146341 0.8125000 0.1875000
## 39642 0.024886878 0.5769231 0.4230769
## 39643 0.023460411 0.4666667 0.5333333
## 39644 0.012738854 0.8333333 0.1666667
## avg_positive_polarity min_positive_polarity max_positive_polarity
## 39639 0.5734694 0.2142857 0.80
## 39640 0.3337912 0.1000000 0.75
## 39641 0.3748252 0.1363636 0.70
## 39642 0.3072727 0.1363636 0.50
## 39643 0.2368506 0.0625000 0.50
## 39644 0.2473377 0.1000000 0.50
## avg_negative_polarity min_negative_polarity max_negative_polarity
## 39639 -0.2500000 -0.25 -0.2500000
## 39640 -0.2600000 -0.50 -0.1250000
## 39641 -0.2111111 -0.40 -0.1000000
## 39642 -0.3564394 -0.80 -0.1666667
## 39643 -0.2052455 -0.50 -0.0125000
## 39644 -0.2000000 -0.20 -0.2000000
## title_subjectivity title_sentiment_polarity abs_title_subjectivity
## 39639 0.0000000 0.0000000 0.50000000
## 39640 0.1000000 0.0000000 0.40000000
## 39641 0.3000000 1.0000000 0.20000000
## 39642 0.4545455 0.1363636 0.04545455
## 39643 0.0000000 0.0000000 0.50000000
## 39644 0.3333333 0.2500000 0.16666667
## abs_title_sentiment_polarity shares
## 39639 0.0000000 1200
## 39640 0.0000000 1800
## 39641 1.0000000 1900
## 39642 0.1363636 1900
## 39643 0.0000000 1100
## 39644 0.2500000 1300
#Number of rows
nrow(pop)
## [1] 39644
#39644 rows
#Number of attributes (columns); ncol() states the intent directly,
#whereas length() on a data frame only happens to equal the column count
ncol(pop)
## [1] 61
#61 variables
#Data type of each attribute.
#vapply() is type-stable (always returns a character vector), unlike sapply(),
#whose return type depends on its input; output is identical for this data frame.
vapply(pop, class, character(1))
## url timedelta
## "character" "numeric"
## n_tokens_title n_tokens_content
## "numeric" "numeric"
## n_unique_tokens n_non_stop_words
## "numeric" "numeric"
## n_non_stop_unique_tokens num_hrefs
## "numeric" "numeric"
## num_self_hrefs num_imgs
## "numeric" "numeric"
## num_videos average_token_length
## "numeric" "numeric"
## num_keywords data_channel_is_lifestyle
## "numeric" "numeric"
## data_channel_is_entertainment data_channel_is_bus
## "numeric" "numeric"
## data_channel_is_socmed data_channel_is_tech
## "numeric" "numeric"
## data_channel_is_world kw_min_min
## "numeric" "numeric"
## kw_max_min kw_avg_min
## "numeric" "numeric"
## kw_min_max kw_max_max
## "numeric" "numeric"
## kw_avg_max kw_min_avg
## "numeric" "numeric"
## kw_max_avg kw_avg_avg
## "numeric" "numeric"
## self_reference_min_shares self_reference_max_shares
## "numeric" "numeric"
## self_reference_avg_sharess weekday_is_monday
## "numeric" "numeric"
## weekday_is_tuesday weekday_is_wednesday
## "numeric" "numeric"
## weekday_is_thursday weekday_is_friday
## "numeric" "numeric"
## weekday_is_saturday weekday_is_sunday
## "numeric" "numeric"
## is_weekend LDA_00
## "numeric" "numeric"
## LDA_01 LDA_02
## "numeric" "numeric"
## LDA_03 LDA_04
## "numeric" "numeric"
## global_subjectivity global_sentiment_polarity
## "numeric" "numeric"
## global_rate_positive_words global_rate_negative_words
## "numeric" "numeric"
## rate_positive_words rate_negative_words
## "numeric" "numeric"
## avg_positive_polarity min_positive_polarity
## "numeric" "numeric"
## max_positive_polarity avg_negative_polarity
## "numeric" "numeric"
## min_negative_polarity max_negative_polarity
## "numeric" "numeric"
## title_subjectivity title_sentiment_polarity
## "numeric" "numeric"
## abs_title_subjectivity abs_title_sentiment_polarity
## "numeric" "numeric"
## shares
## "integer"
#The "shares" column is of type integer, "url" is of type character, and the remainder of the variables are of type
#numeric.
#Structure of the dataset: dimensions, column names, storage types, and example values
str(pop)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : chr "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
#Number of missing values; is.na() already returns a logical matrix,
#so summing it directly counts the TRUEs (no "== TRUE" needed)
sum(is.na(pop))
## [1] 0
#There are no missing values in the dataset.
#Remove the non-predictive columns (url, timedelta) from the dataset by name,
#which is more robust than the positional index 3:61 if column order changes
pop_new <- pop[, !(names(pop) %in% c("url", "timedelta"))]
#Dimensions of the new dataset
dim(pop_new)
## [1] 39644 59
#39644 rows and 59 columns
#Add a column that categorizes articles into binary levels of popularity according to the number of shares:
#<= 1400 shares -> unpopular ("0"); > 1400 shares -> popular ("1").
#(ifelse() creates the column in one step; no need to initialize it to NA first.)
pop_new$shares_cat <- ifelse(pop_new$shares <= 1400, 0, 1)
#Change the binary indicator variables (and the popularity label) from numeric to factor.
#Listing the columns once and converting with lapply() avoids 15 copy-pasted
#as.factor() assignments and keeps the column set in a single place.
factor_cols <- c("data_channel_is_lifestyle", "data_channel_is_entertainment",
                 "data_channel_is_bus", "data_channel_is_socmed",
                 "data_channel_is_tech", "data_channel_is_world",
                 "weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday",
                 "weekday_is_thursday", "weekday_is_friday", "weekday_is_saturday",
                 "weekday_is_sunday", "is_weekend", "shares_cat")
pop_new[factor_cols] <- lapply(pop_new[factor_cols], as.factor)
#Check the structure of the new dataset (the indicator columns are now factors,
#and shares_cat has been appended as the 60th column)
str(pop_new)
## 'data.frame': 39644 obs. of 60 variables:
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
## $ data_channel_is_entertainment: Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 1 1 1 ...
## $ data_channel_is_bus : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 1 1 ...
## $ data_channel_is_socmed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ data_channel_is_tech : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 1 2 2 1 ...
## $ data_channel_is_world : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ weekday_is_tuesday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_wednesday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_thursday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_friday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_saturday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_sunday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ is_weekend : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
## $ shares_cat : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 2 1 ...
#Five number summary (plus mean) for the numeric attributes; and levels and
#frequency tables for the factor columns
summary(pop_new)
## n_tokens_title n_tokens_content n_unique_tokens n_non_stop_words
## Min. : 2.0 Min. : 0.0 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 9.0 1st Qu.: 246.0 1st Qu.: 0.4709 1st Qu.: 1.0000
## Median :10.0 Median : 409.0 Median : 0.5392 Median : 1.0000
## Mean :10.4 Mean : 546.5 Mean : 0.5482 Mean : 0.9965
## 3rd Qu.:12.0 3rd Qu.: 716.0 3rd Qu.: 0.6087 3rd Qu.: 1.0000
## Max. :23.0 Max. :8474.0 Max. :701.0000 Max. :1042.0000
## n_non_stop_unique_tokens num_hrefs num_self_hrefs num_imgs
## Min. : 0.0000 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.6257 1st Qu.: 4.00 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 0.6905 Median : 8.00 Median : 3.000 Median : 1.000
## Mean : 0.6892 Mean : 10.88 Mean : 3.294 Mean : 4.544
## 3rd Qu.: 0.7546 3rd Qu.: 14.00 3rd Qu.: 4.000 3rd Qu.: 4.000
## Max. :650.0000 Max. :304.00 Max. :116.000 Max. :128.000
## num_videos average_token_length num_keywords
## Min. : 0.00 Min. :0.000 Min. : 1.000
## 1st Qu.: 0.00 1st Qu.:4.478 1st Qu.: 6.000
## Median : 0.00 Median :4.664 Median : 7.000
## Mean : 1.25 Mean :4.548 Mean : 7.224
## 3rd Qu.: 1.00 3rd Qu.:4.855 3rd Qu.: 9.000
## Max. :91.00 Max. :8.042 Max. :10.000
## data_channel_is_lifestyle data_channel_is_entertainment data_channel_is_bus
## 0:37545 0:32587 0:33386
## 1: 2099 1: 7057 1: 6258
##
##
##
##
## data_channel_is_socmed data_channel_is_tech data_channel_is_world
## 0:37321 0:32298 0:31217
## 1: 2323 1: 7346 1: 8427
##
##
##
##
## kw_min_min kw_max_min kw_avg_min kw_min_max
## Min. : -1.00 Min. : 0 Min. : -1.0 Min. : 0
## 1st Qu.: -1.00 1st Qu.: 445 1st Qu.: 141.8 1st Qu.: 0
## Median : -1.00 Median : 660 Median : 235.5 Median : 1400
## Mean : 26.11 Mean : 1154 Mean : 312.4 Mean : 13612
## 3rd Qu.: 4.00 3rd Qu.: 1000 3rd Qu.: 357.0 3rd Qu.: 7900
## Max. :377.00 Max. :298400 Max. :42827.9 Max. :843300
## kw_max_max kw_avg_max kw_min_avg kw_max_avg
## Min. : 0 Min. : 0 Min. : -1 Min. : 0
## 1st Qu.:843300 1st Qu.:172847 1st Qu.: 0 1st Qu.: 3562
## Median :843300 Median :244572 Median :1024 Median : 4356
## Mean :752324 Mean :259282 Mean :1117 Mean : 5657
## 3rd Qu.:843300 3rd Qu.:330980 3rd Qu.:2057 3rd Qu.: 6020
## Max. :843300 Max. :843300 Max. :3613 Max. :298400
## kw_avg_avg self_reference_min_shares self_reference_max_shares
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 2382 1st Qu.: 639 1st Qu.: 1100
## Median : 2870 Median : 1200 Median : 2800
## Mean : 3136 Mean : 3999 Mean : 10329
## 3rd Qu.: 3600 3rd Qu.: 2600 3rd Qu.: 8000
## Max. :43568 Max. :843300 Max. :843300
## self_reference_avg_sharess weekday_is_monday weekday_is_tuesday
## Min. : 0.0 0:32983 0:32254
## 1st Qu.: 981.2 1: 6661 1: 7390
## Median : 2200.0
## Mean : 6401.7
## 3rd Qu.: 5200.0
## Max. :843300.0
## weekday_is_wednesday weekday_is_thursday weekday_is_friday weekday_is_saturday
## 0:32209 0:32377 0:33943 0:37191
## 1: 7435 1: 7267 1: 5701 1: 2453
##
##
##
##
## weekday_is_sunday is_weekend LDA_00 LDA_01
## 0:36907 0:34454 Min. :0.00000 Min. :0.00000
## 1: 2737 1: 5190 1st Qu.:0.02505 1st Qu.:0.02501
## Median :0.03339 Median :0.03334
## Mean :0.18460 Mean :0.14126
## 3rd Qu.:0.24096 3rd Qu.:0.15083
## Max. :0.92699 Max. :0.92595
## LDA_02 LDA_03 LDA_04 global_subjectivity
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.02857 1st Qu.:0.02857 1st Qu.:0.02857 1st Qu.:0.3962
## Median :0.04000 Median :0.04000 Median :0.04073 Median :0.4535
## Mean :0.21632 Mean :0.22377 Mean :0.23403 Mean :0.4434
## 3rd Qu.:0.33422 3rd Qu.:0.37576 3rd Qu.:0.39999 3rd Qu.:0.5083
## Max. :0.92000 Max. :0.92653 Max. :0.92719 Max. :1.0000
## global_sentiment_polarity global_rate_positive_words
## Min. :-0.39375 Min. :0.00000
## 1st Qu.: 0.05776 1st Qu.:0.02838
## Median : 0.11912 Median :0.03902
## Mean : 0.11931 Mean :0.03962
## 3rd Qu.: 0.17783 3rd Qu.:0.05028
## Max. : 0.72784 Max. :0.15549
## global_rate_negative_words rate_positive_words rate_negative_words
## Min. :0.000000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.009615 1st Qu.:0.6000 1st Qu.:0.1852
## Median :0.015337 Median :0.7105 Median :0.2800
## Mean :0.016612 Mean :0.6822 Mean :0.2879
## 3rd Qu.:0.021739 3rd Qu.:0.8000 3rd Qu.:0.3846
## Max. :0.184932 Max. :1.0000 Max. :1.0000
## avg_positive_polarity min_positive_polarity max_positive_polarity
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3062 1st Qu.:0.05000 1st Qu.:0.6000
## Median :0.3588 Median :0.10000 Median :0.8000
## Mean :0.3538 Mean :0.09545 Mean :0.7567
## 3rd Qu.:0.4114 3rd Qu.:0.10000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
## avg_negative_polarity min_negative_polarity max_negative_polarity
## Min. :-1.0000 Min. :-1.0000 Min. :-1.0000
## 1st Qu.:-0.3284 1st Qu.:-0.7000 1st Qu.:-0.1250
## Median :-0.2533 Median :-0.5000 Median :-0.1000
## Mean :-0.2595 Mean :-0.5219 Mean :-0.1075
## 3rd Qu.:-0.1869 3rd Qu.:-0.3000 3rd Qu.:-0.0500
## Max. : 0.0000 Max. : 0.0000 Max. : 0.0000
## title_subjectivity title_sentiment_polarity abs_title_subjectivity
## Min. :0.0000 Min. :-1.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.:0.1667
## Median :0.1500 Median : 0.00000 Median :0.5000
## Mean :0.2824 Mean : 0.07143 Mean :0.3418
## 3rd Qu.:0.5000 3rd Qu.: 0.15000 3rd Qu.:0.5000
## Max. :1.0000 Max. : 1.00000 Max. :0.5000
## abs_title_sentiment_polarity shares shares_cat
## Min. :0.0000 Min. : 1 0:20082
## 1st Qu.:0.0000 1st Qu.: 946 1:19562
## Median :0.0000 Median : 1400
## Mean :0.1561 Mean : 3395
## 3rd Qu.:0.2500 3rd Qu.: 2800
## Max. :1.0000 Max. :843300
#Note the dataset is fairly balanced in terms of the popularity level of an article:
#0 1
#20082 19562
#Standard deviation for a selection of features
#(computed one column at a time so each value is printed next to its label below)
sd(pop_new$n_tokens_title)
## [1] 2.114037
#Standard deviation for the number of words in the title is 2.114037
sd(pop_new$n_tokens_content)
## [1] 471.1075
#Standard deviation for the number of words in the content is 471.1075
sd(pop_new$num_hrefs)
## [1] 11.33202
#Standard deviation for the number of links is 11.33202
sd(pop_new$num_self_hrefs)
## [1] 3.855141
#Standard deviation for the number of links to other articles published by Mashable is 3.855141
sd(pop_new$num_imgs)
## [1] 8.309434
#Standard deviation for the number of images is 8.309434
sd(pop_new$num_videos)
## [1] 4.107855
#Standard deviation for the number of videos is 4.107855
sd(pop_new$num_keywords)
## [1] 1.90913
#Standard deviation for the number of keywords in the metadata is 1.90913
sd(pop_new$shares)
## [1] 11626.95
#Standard deviation for the number of shares is 11626.95
#Change the factor columns back to numeric (0/1) to conduct further analysis.
#as.numeric(levels(f))[as.integer(f)] is the documented safe idiom: calling
#as.numeric() on a factor directly would return the internal level indices
#(1/2) rather than the original 0/1 labels. One lapply() replaces 15
#copy-pasted conversions.
factor_cols <- c("data_channel_is_lifestyle", "data_channel_is_entertainment",
                 "data_channel_is_bus", "data_channel_is_socmed",
                 "data_channel_is_tech", "data_channel_is_world",
                 "weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday",
                 "weekday_is_thursday", "weekday_is_friday", "weekday_is_saturday",
                 "weekday_is_sunday", "is_weekend", "shares_cat")
pop_new[factor_cols] <- lapply(pop_new[factor_cols],
                               function(f) as.numeric(levels(f))[as.integer(f)])
#Outliers - Boxplots
#Number of shares (heavily right-skewed, so many points plot beyond the upper whisker)
boxplot(pop_new[c("shares")], xlab = "Number of Shares", ylab = "Count")
#Number of images and videos, side by side on the same axes
boxplot(pop_new[c("num_imgs", "num_videos")], names = c("Number of Images", "Number of Videos"), ylab = "Count")
#Number of Keywords; Number of Words in the Title; and Number of Words in the Content
#par(mfrow = c(1, 3)) lays the next three plots out in one row of three panels
par(mfrow=c(1,3))
boxplot(pop_new$num_keywords, xlab = "Number of Keywords", ylab = "Count")
boxplot(pop_new$n_tokens_title, xlab = "Number of Words in the Title", ylab = "Count")
boxplot(pop_new$n_tokens_content, xlab = "Number of Words in the Content", ylab = "Count")
#Visualizing the data using boxplots shows that there are outliers present in the data.
#Distributions of input and output variables - Histograms
#Popularity of an Article
#Reset the plotting layout to a single panel before the histogram
par(mfrow=c(1,1))
#shares_cat is 0/1, so breaks at (-1, 0, 1) give one bar per class;
#labels places the class names above the bars
hist(pop_new$shares_cat, xlab = "Popularity of an Article", ylab = "Count", breaks = c(-1,0,1), main = NULL,
labels = c("Unpopular", "Popular"))
#The dataset is fairly balanced between unpopular and popular articles.
#Type of data channel that the article was published
library(ggplot2)
#Count how many articles fall in each data channel: the indicator columns are
#0/1, so their column sums are per-channel article counts.
channel_cols <- c("data_channel_is_lifestyle", "data_channel_is_entertainment",
                  "data_channel_is_bus", "data_channel_is_socmed",
                  "data_channel_is_tech", "data_channel_is_world")
tab <- data.frame(Count = colSums(pop_new[channel_cols]), Channel = channel_cols)
#Bar chart of article counts per channel, with readable axis labels
ggplot(data = tab, aes(x = Channel, y = Count)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(breaks = channel_cols,
                   labels = c("Lifestyle", "Entertainment", "Business", "Social Media",
                              "Technology", "World"))
#Most articles are published to “World” followed by “Technology,” “Entertainment,” “Business,” “Social Media,” and #finally “Lifestyle”.
#Day of the Week that the article was published
#The weekday indicator columns are 0/1, so their column sums give the number
#of articles published on each day.
day_cols <- c("weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday",
              "weekday_is_thursday", "weekday_is_friday", "weekday_is_saturday",
              "weekday_is_sunday")
tab2 <- data.frame(Count = colSums(pop_new[day_cols]), Day = day_cols)
#Fix the factor levels so the bars appear in chronological (Mon-Sun) order
#instead of ggplot's default alphabetical order
tab2$Day <- factor(tab2$Day, levels = day_cols)
ggplot(data = tab2, aes(x = Day, y = Count)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(breaks = day_cols,
                   labels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                              "Saturday", "Sunday"))
#Most articles are published on Wednesday, followed by Tuesday, Thursday, Monday, Friday, Sunday, and finally Saturday.
#Initial Analysis - Bivariate Analysis
#Pairwise Visualizations - Scatterplots
#Scatterplots between Shares and Text Subjectivity, Text Sentiment Polarity, Title Subjectivity and Title Polarity
#2x2 grid: one panel per sentiment/subjectivity feature plotted against shares
par(mfrow = c(2,2))
plot(pop_new$global_subjectivity, pop_new$shares, xlab = "Text Subjectivity", ylab = "Shares")
plot(pop_new$global_sentiment_polarity, pop_new$shares, xlab = "Text Sentiment Polarity", ylab = "Shares")
plot(pop_new$title_subjectivity, pop_new$shares, xlab = "Title Subjectivity", ylab = "Shares")
plot(pop_new$title_sentiment_polarity, pop_new$shares, xlab = "Title Polarity", ylab = "Shares")
#Shares tend to cluster around the center of the distribution suggesting that the most popular articles tend to be #more neutral.
#Scatterplot matrix
par(mfrow=c(1,1))
#Upper-panel function for pairs(): draws the absolute correlation between the
#two panel variables (text size scaled by correlation strength) plus a
#significance annotation ("***", "**", "*", ".", or " ") from cor.test().
#x, y: numeric vectors of the current panel
#digits: significant digits for the displayed correlation
#prefix: optional string prepended to the correlation text
#cex.cor: optional fixed text size; when missing, a size is derived from the
#         string width so the label roughly fills the panel
panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
  #Temporarily switch the panel to a unit coordinate system; restore on exit
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  #Formatting alongside 0.123456789 keeps a stable number of decimals
  txt <- format(c(r, 0.123456789), digits=digits)[1]
  txt <- paste(prefix, txt, sep="")
  #BUG FIX: the original only assigned cex when cex.cor was missing, so
  #supplying cex.cor caused "object 'cex' not found" in the text() calls below.
  cex <- if (missing(cex.cor)) 0.8/strwidth(txt) else cex.cor
  test <- cor.test(x,y)
  #borrowed from printCoefmat
  Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
                   cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                   symbols = c("***", "**", "*", ".", " "))
  text(0.5, 0.5, txt, cex = cex * r)
  text(.8, .8, Signif, cex=cex, col=2)
}
#Columns examined in the scatterplot matrix: the four sentiment/subjectivity
#features plus the target (shares).
selected_cols <- c("global_subjectivity", "global_sentiment_polarity",
                   "title_subjectivity", "title_sentiment_polarity", "shares")
pop_new_Selected_Features <- pop_new[selected_cols]
#Scatterplot matrix: smoothed scatterplots below the diagonal, correlation
#coefficients with significance stars (panel.cor) above it.
pairs(pop_new_Selected_Features, lower.panel = panel.smooth, upper.panel = panel.cor)
#Low Variance Filter
library(caret)
## Loading required package: lattice
#Near-zero variance may be applied to the data to ensure variables are used that help distinguish/determine the
#popularity of an article.
#Dataframe with predictor information including whether variables have one unique value, or have very few unique values
#compared to the number of samples, or whether the ratio of the frequency of the most common value to the second
#most common value is large.
#saveMetrics = TRUE returns the full per-predictor diagnostics table
#(freqRatio, percentUnique, zeroVar, nzv) instead of just the flagged indices.
nzv <- nearZeroVar(pop_new, saveMetrics= TRUE)
print(nzv)
## freqRatio percentUnique zeroVar nzv
## n_tokens_title 1.057184 0.05044900 FALSE FALSE
## n_tokens_content 11.693069 6.06901423 FALSE FALSE
## n_unique_tokens 118.100000 68.81495308 FALSE FALSE
## n_non_stop_words 8.557971 3.66007466 FALSE FALSE
## n_non_stop_unique_tokens 90.846154 57.83977399 FALSE FALSE
## num_hrefs 1.011367 0.33548582 FALSE FALSE
## num_self_hrefs 1.123159 0.14882454 FALSE FALSE
## num_imgs 2.593245 0.22954293 FALSE FALSE
## num_videos 2.639038 0.13368984 FALSE FALSE
## average_token_length 15.337662 76.01654727 FALSE FALSE
## num_keywords 1.076606 0.02522450 FALSE FALSE
## data_channel_is_lifestyle 17.887089 0.00504490 FALSE FALSE
## data_channel_is_entertainment 4.617685 0.00504490 FALSE FALSE
## data_channel_is_bus 5.334931 0.00504490 FALSE FALSE
## data_channel_is_socmed 16.065863 0.00504490 FALSE FALSE
## data_channel_is_tech 4.396678 0.00504490 FALSE FALSE
## data_channel_is_world 3.704403 0.00504490 FALSE FALSE
## kw_min_min 1.929795 0.06558369 FALSE FALSE
## kw_max_min 1.268519 2.71415599 FALSE FALSE
## kw_avg_min 8.567901 42.88921400 FALSE FALSE
## kw_min_max 45.139842 2.57542125 FALSE TRUE
## kw_max_max 7.577521 0.08828574 FALSE FALSE
## kw_avg_max 1.385965 77.77721723 FALSE FALSE
## kw_min_avg 111.051948 40.31379276 FALSE FALSE
## kw_max_avg 1.795455 49.03137928 FALSE FALSE
## kw_avg_avg 9.875000 99.13227727 FALSE FALSE
## self_reference_min_shares 4.154024 3.16567450 FALSE FALSE
## self_reference_max_shares 9.366841 2.86802543 FALSE FALSE
## self_reference_avg_sharess 12.585965 21.75865200 FALSE FALSE
## weekday_is_monday 4.951659 0.00504490 FALSE FALSE
## weekday_is_tuesday 4.364547 0.00504490 FALSE FALSE
## weekday_is_wednesday 4.332078 0.00504490 FALSE FALSE
## weekday_is_thursday 4.455346 0.00504490 FALSE FALSE
## weekday_is_friday 5.953868 0.00504490 FALSE FALSE
## weekday_is_saturday 15.161435 0.00504490 FALSE FALSE
## weekday_is_sunday 13.484472 0.00504490 FALSE FALSE
## is_weekend 6.638536 0.00504490 FALSE FALSE
## LDA_00 3.000000 99.22560791 FALSE FALSE
## LDA_01 1.416667 98.62274241 FALSE FALSE
## LDA_02 8.500000 99.69982847 FALSE FALSE
## LDA_03 1.137255 98.28221168 FALSE FALSE
## LDA_04 2.684211 99.30884875 FALSE FALSE
## global_subjectivity 32.888889 87.02704066 FALSE FALSE
## global_sentiment_polarity 39.900000 87.51639592 FALSE FALSE
## global_rate_positive_words 10.141667 33.19291696 FALSE FALSE
## global_rate_negative_words 29.310345 25.90808193 FALSE FALSE
## rate_positive_words 1.178886 5.76127535 FALSE FALSE
## rate_negative_words 1.585821 5.76127535 FALSE FALSE
## avg_positive_polarity 6.054726 68.86540208 FALSE FALSE
## min_positive_polarity 2.086363 0.08324084 FALSE FALSE
## max_positive_polarity 2.238484 0.09585309 FALSE FALSE
## avg_negative_polarity 5.110220 34.91322773 FALSE FALSE
## min_negative_polarity 1.241491 0.13621229 FALSE FALSE
## max_negative_polarity 1.128019 0.12360004 FALSE FALSE
## title_subjectivity 6.817941 1.69760872 FALSE FALSE
## title_sentiment_polarity 9.428977 2.05075169 FALSE FALSE
## abs_title_subjectivity 7.771093 1.34194330 FALSE FALSE
## abs_title_sentiment_polarity 7.492099 1.64715972 FALSE FALSE
## shares 1.143707 3.66764201 FALSE FALSE
## shares_cat 1.026582 0.00504490 FALSE FALSE
# freqRatio percentUnique zeroVar nzv
#n_tokens_title 1.057184 0.05044900 FALSE FALSE
#n_tokens_content 11.693069 6.06901423 FALSE FALSE
#n_unique_tokens 118.100000 68.81495308 FALSE FALSE
#n_non_stop_words 8.557971 3.66007466 FALSE FALSE
#n_non_stop_unique_tokens 90.846154 57.83977399 FALSE FALSE
#num_hrefs 1.011367 0.33548582 FALSE FALSE
#num_self_hrefs 1.123159 0.14882454 FALSE FALSE
#num_imgs 2.593245 0.22954293 FALSE FALSE
#num_videos 2.639038 0.13368984 FALSE FALSE
#average_token_length 15.337662 76.01654727 FALSE FALSE
#num_keywords 1.076606 0.02522450 FALSE FALSE
#data_channel_is_lifestyle 17.887089 0.00504490 FALSE FALSE
#data_channel_is_entertainment 4.617685 0.00504490 FALSE FALSE
#data_channel_is_bus 5.334931 0.00504490 FALSE FALSE
#data_channel_is_socmed 16.065863 0.00504490 FALSE FALSE
#data_channel_is_tech 4.396678 0.00504490 FALSE FALSE
#data_channel_is_world 3.704403 0.00504490 FALSE FALSE
#kw_min_min 1.929795 0.06558369 FALSE FALSE
#kw_max_min 1.268519 2.71415599 FALSE FALSE
#kw_avg_min 8.567901 42.88921400 FALSE FALSE
#kw_min_max 45.139842 2.57542125 FALSE TRUE
#kw_max_max 7.577521 0.08828574 FALSE FALSE
#kw_avg_max 1.385965 77.77721723 FALSE FALSE
#kw_min_avg 111.051948 40.31379276 FALSE FALSE
#kw_max_avg 1.795455 49.03137928 FALSE FALSE
#kw_avg_avg 9.875000 99.13227727 FALSE FALSE
#self_reference_min_shares 4.154024 3.16567450 FALSE FALSE
#self_reference_max_shares 9.366841 2.86802543 FALSE FALSE
#self_reference_avg_sharess 12.585965 21.75865200 FALSE FALSE
#weekday_is_monday 4.951659 0.00504490 FALSE FALSE
#weekday_is_tuesday 4.364547 0.00504490 FALSE FALSE
#weekday_is_wednesday 4.332078 0.00504490 FALSE FALSE
#weekday_is_thursday 4.455346 0.00504490 FALSE FALSE
#weekday_is_friday 5.953868 0.00504490 FALSE FALSE
#weekday_is_saturday 15.161435 0.00504490 FALSE FALSE
#weekday_is_sunday 13.484472 0.00504490 FALSE FALSE
#is_weekend 6.638536 0.00504490 FALSE FALSE
#LDA_00 3.000000 99.22560791 FALSE FALSE
#LDA_01 1.416667 98.62274241 FALSE FALSE
#LDA_02 8.500000 99.69982847 FALSE FALSE
#LDA_03 1.137255 98.28221168 FALSE FALSE
#LDA_04 2.684211 99.30884875 FALSE FALSE
#global_subjectivity 32.888889 87.02704066 FALSE FALSE
#global_sentiment_polarity 39.900000 87.51639592 FALSE FALSE
#global_rate_positive_words 10.141667 33.19291696 FALSE FALSE
#global_rate_negative_words 29.310345 25.90808193 FALSE FALSE
#rate_positive_words 1.178886 5.76127535 FALSE FALSE
#rate_negative_words 1.585821 5.76127535 FALSE FALSE
#avg_positive_polarity 6.054726 68.86540208 FALSE FALSE
#min_positive_polarity 2.086363 0.08324084 FALSE FALSE
#max_positive_polarity 2.238484 0.09585309 FALSE FALSE
#avg_negative_polarity 5.110220 34.91322773 FALSE FALSE
#min_negative_polarity 1.241491 0.13621229 FALSE FALSE
#max_negative_polarity 1.128019 0.12360004 FALSE FALSE
#title_subjectivity 6.817941 1.69760872 FALSE FALSE
#title_sentiment_polarity 9.428977 2.05075169 FALSE FALSE
#abs_title_subjectivity 7.771093 1.34194330 FALSE FALSE
#abs_title_sentiment_polarity 7.492099 1.64715972 FALSE FALSE
#shares 1.143707 3.66764201 FALSE FALSE
#shares_cat 1.026582 0.00504490 FALSE FALSE
#Column names of the zero- or near-zero predictors
#Derived from the metrics table (nzv) computed above, rather than re-running
#nearZeroVar() on the full 39,644-row data frame two more times; the nzv
#metrics column flags exactly the predictors nearZeroVar() would return.
nzv2 <- rownames(nzv)[nzv$nzv]
print(nzv2)
## [1] "kw_min_max"
#The results indicate one predictor - the minimum shares of the best keywords - has low variance.
#Return column index
nzv3 <- which(nzv$nzv)
print(nzv3)
## [1] 21
#Drop column with low variance
#Guard the empty case: pop_new[, -integer(0)] would select ZERO columns,
#silently discarding the whole data frame.
filtered_pop <- if (length(nzv3) > 0) pop_new[, -nzv3] else pop_new
#Dimensions of new data frame
dim(filtered_pop)
## [1] 39644 59
#39644 rows and 59 columns.
#Remove the number of shares since the analysis will focus on predicting the popularity of online news articles as a
#binary classification task.
#Check the column names and remove the shares (integer) column
#(names() at top level auto-prints the 59 remaining column names)
names(filtered_pop)
## [1] "n_tokens_title" "n_tokens_content"
## [3] "n_unique_tokens" "n_non_stop_words"
## [5] "n_non_stop_unique_tokens" "num_hrefs"
## [7] "num_self_hrefs" "num_imgs"
## [9] "num_videos" "average_token_length"
## [11] "num_keywords" "data_channel_is_lifestyle"
## [13] "data_channel_is_entertainment" "data_channel_is_bus"
## [15] "data_channel_is_socmed" "data_channel_is_tech"
## [17] "data_channel_is_world" "kw_min_min"
## [19] "kw_max_min" "kw_avg_min"
## [21] "kw_max_max" "kw_avg_max"
## [23] "kw_min_avg" "kw_max_avg"
## [25] "kw_avg_avg" "self_reference_min_shares"
## [27] "self_reference_max_shares" "self_reference_avg_sharess"
## [29] "weekday_is_monday" "weekday_is_tuesday"
## [31] "weekday_is_wednesday" "weekday_is_thursday"
## [33] "weekday_is_friday" "weekday_is_saturday"
## [35] "weekday_is_sunday" "is_weekend"
## [37] "LDA_00" "LDA_01"
## [39] "LDA_02" "LDA_03"
## [41] "LDA_04" "global_subjectivity"
## [43] "global_sentiment_polarity" "global_rate_positive_words"
## [45] "global_rate_negative_words" "rate_positive_words"
## [47] "rate_negative_words" "avg_positive_polarity"
## [49] "min_positive_polarity" "max_positive_polarity"
## [51] "avg_negative_polarity" "min_negative_polarity"
## [53] "max_negative_polarity" "title_subjectivity"
## [55] "title_sentiment_polarity" "abs_title_subjectivity"
## [57] "abs_title_sentiment_polarity" "shares"
## [59] "shares_cat"
#Drop the raw share count by name rather than by positional index (-58),
#so the removal still targets "shares" even if the column layout changes.
filtered_pop <- filtered_pop[names(filtered_pop) != "shares"]
#Correlation Analysis
library(corrplot)
## corrplot 0.84 loaded
library(MASS)
## Warning: package 'MASS' was built under R version 3.6.2
#Use Spearman correlation coefficient because the data is not normally distributed and has categorical variables
cor_matrix <- cor(filtered_pop, use = "complete.obs", method = "spearman")
#Blank out the diagonal and lower triangle so each variable pair appears once
cor_matrix[lower.tri(cor_matrix, diag = TRUE)] <- NA
#Melt the matrix into a long (Var1, Var2, Freq) table, dropping the NA cells
pop_new_cor <- na.omit(as.data.frame(as.table(cor_matrix)))
#Keep only the at-least-moderate correlations (|rho| > 0.4)
pop_new_cor <- pop_new_cor[abs(pop_new_cor$Freq) > 0.4, ]
#Sort with the strongest correlations first
pop_new_cor <- pop_new_cor[order(abs(pop_new_cor$Freq), decreasing = TRUE), ]
#Print the table
print(pop_new_cor)
## Var1 Var2 Freq
## 176 n_tokens_content n_non_stop_words 0.9949854
## 1593 self_reference_max_shares self_reference_avg_sharess 0.9731616
## 235 n_unique_tokens n_non_stop_unique_tokens 0.8813053
## 3302 title_subjectivity abs_title_sentiment_polarity 0.8394189
## 2714 rate_positive_words rate_negative_words -0.8320259
## 2713 global_rate_negative_words rate_negative_words 0.8243605
## 1416 kw_max_avg kw_avg_avg 0.8070733
## 1121 kw_max_min kw_avg_min 0.8066579
## 2653 global_sentiment_polarity rate_positive_words 0.8032534
## 1592 self_reference_min_shares self_reference_avg_sharess 0.8020695
## 3009 avg_negative_polarity min_negative_polarity 0.7791302
## 1178 kw_min_min kw_max_max -0.7493595
## 118 n_tokens_content n_unique_tokens -0.7204282
## 177 n_unique_tokens n_non_stop_words -0.7111355
## 2065 weekday_is_sunday is_weekend 0.7016478
## 1534 self_reference_min_shares self_reference_max_shares 0.6808056
## 2221 data_channel_is_world LDA_02 0.6758580
## 3304 abs_title_subjectivity abs_title_sentiment_polarity -0.6748752
## 3244 title_subjectivity abs_title_subjectivity -0.6739325
## 2711 global_sentiment_polarity rate_negative_words -0.6632427
## 2064 weekday_is_saturday is_weekend 0.6617074
## 2655 global_rate_negative_words rate_positive_words -0.6563685
## 2336 data_channel_is_tech LDA_04 0.6152844
## 2890 avg_positive_polarity max_positive_polarity 0.6004516
## 2537 global_sentiment_polarity global_rate_positive_words 0.5922442
## 2102 data_channel_is_bus LDA_00 0.5920692
## 2654 global_rate_positive_words rate_positive_words 0.5638215
## 236 n_non_stop_words n_non_stop_unique_tokens -0.5587888
## 1415 kw_min_avg kw_avg_avg 0.5539081
## 1515 num_self_hrefs self_reference_max_shares 0.5423319
## 234 n_tokens_content n_non_stop_unique_tokens -0.5370046
## 294 n_non_stop_words num_hrefs 0.5334565
## 1239 kw_max_max kw_avg_max 0.5277993
## 3003 global_rate_negative_words min_negative_polarity -0.5228170
## 2960 n_tokens_content min_negative_polarity -0.5158222
## 2962 n_non_stop_words min_negative_polarity -0.5150513
## 1414 kw_avg_max kw_avg_avg 0.5094611
## 1236 kw_min_min kw_avg_max -0.5067561
## 292 n_tokens_content num_hrefs 0.5061764
## 2769 global_sentiment_polarity avg_positive_polarity 0.4910328
## 1238 kw_avg_min kw_avg_max -0.4872756
## 2157 num_keywords LDA_01 -0.4793513
## 2846 n_non_stop_words max_positive_polarity 0.4691861
## 1180 kw_avg_min kw_max_max -0.4689681
## 2099 num_keywords LDA_00 -0.4679866
## 2844 n_tokens_content max_positive_polarity 0.4677236
## 2287 kw_avg_avg LDA_03 0.4644048
## 2768 global_subjectivity avg_positive_polarity 0.4547775
## 3005 rate_negative_words min_negative_polarity -0.4499243
## 411 n_non_stop_unique_tokens num_imgs -0.4448630
## 2595 global_sentiment_polarity global_rate_negative_words -0.4439893
## 3303 title_sentiment_polarity abs_title_sentiment_polarity 0.4405846
## 1573 num_self_hrefs self_reference_avg_sharess 0.4381096
## 2159 data_channel_is_entertainment LDA_01 0.4295302
## 1120 kw_min_min kw_avg_min 0.4234541
## 2787 n_unique_tokens min_positive_polarity 0.4224206
## 2885 global_sentiment_polarity max_positive_polarity 0.4195723
## 410 n_non_stop_words num_imgs 0.4145083
## 2886 global_rate_positive_words max_positive_polarity 0.4056356
# Var1 Var2 Freq
#176 n_tokens_content n_non_stop_words 0.9949854
#1593 self_reference_max_shares self_reference_avg_sharess 0.9731616
#235 n_unique_tokens n_non_stop_unique_tokens 0.8813053
#3302 title_subjectivity abs_title_sentiment_polarity 0.8394189
#2714 rate_positive_words rate_negative_words -0.8320259
#2713 global_rate_negative_words rate_negative_words 0.8243605
#1416 kw_max_avg kw_avg_avg 0.8070733
#1121 kw_max_min kw_avg_min 0.8066579
#2653 global_sentiment_polarity rate_positive_words 0.8032534
#1592 self_reference_min_shares self_reference_avg_sharess 0.8020695
#3009 avg_negative_polarity min_negative_polarity 0.7791302
#1178 kw_min_min kw_max_max -0.7493595
#118 n_tokens_content n_unique_tokens -0.7204282
#177 n_unique_tokens n_non_stop_words -0.7111355
#2065 weekday_is_sunday is_weekend 0.7016478
#1534 self_reference_min_shares self_reference_max_shares 0.6808056
#2221 data_channel_is_world LDA_02 0.6758580
#3304 abs_title_subjectivity abs_title_sentiment_polarity -0.6748752
#3244 title_subjectivity abs_title_subjectivity -0.6739325
#2711 global_sentiment_polarity rate_negative_words -0.6632427
#2064 weekday_is_saturday is_weekend 0.6617074
#2655 global_rate_negative_words rate_positive_words -0.6563685
#2336 data_channel_is_tech LDA_04 0.6152844
#2890 avg_positive_polarity max_positive_polarity 0.6004516
#2537 global_sentiment_polarity global_rate_positive_words 0.5922442
#2102 data_channel_is_bus LDA_00 0.5920692
#2654 global_rate_positive_words rate_positive_words 0.5638215
#236 n_non_stop_words n_non_stop_unique_tokens -0.5587888
#1415 kw_min_avg kw_avg_avg 0.5539081
#1515 num_self_hrefs self_reference_max_shares 0.5423319
#234 n_tokens_content n_non_stop_unique_tokens -0.5370046
#294 n_non_stop_words num_hrefs 0.5334565
#1239 kw_max_max kw_avg_max 0.5277993
#3003 global_rate_negative_words min_negative_polarity -0.5228170
#2960 n_tokens_content min_negative_polarity -0.5158222
#2962 n_non_stop_words min_negative_polarity -0.5150513
#1414 kw_avg_max kw_avg_avg 0.5094611
#1236 kw_min_min kw_avg_max -0.5067561
#292 n_tokens_content num_hrefs 0.5061764
#2769 global_sentiment_polarity avg_positive_polarity 0.4910328
#1238 kw_avg_min kw_avg_max -0.4872756
#2157 num_keywords LDA_01 -0.4793513
#2846 n_non_stop_words max_positive_polarity 0.4691861
#1180 kw_avg_min kw_max_max -0.4689681
#2099 num_keywords LDA_00 -0.4679866
#2844 n_tokens_content max_positive_polarity 0.4677236
#2287 kw_avg_avg LDA_03 0.4644048
#2768 global_subjectivity avg_positive_polarity 0.4547775
#3005 rate_negative_words min_negative_polarity -0.4499243
#411 n_non_stop_unique_tokens num_imgs -0.4448630
#2595 global_sentiment_polarity global_rate_negative_words -0.4439893
#3303 title_sentiment_polarity abs_title_sentiment_polarity 0.4405846
#1573 num_self_hrefs self_reference_avg_sharess 0.4381096
#2159 data_channel_is_entertainment LDA_01 0.4295302
#1120 kw_min_min kw_avg_min 0.4234541
#2787 n_unique_tokens min_positive_polarity 0.4224206
#2885 global_sentiment_polarity max_positive_polarity 0.4195723
#410 n_non_stop_words num_imgs 0.4145083
#2886 global_rate_positive_words max_positive_polarity 0.4056356
#Transform the table to a matrix to use corrplot
#acast pivots the long (Var1, Var2, Freq) table back to a wide matrix; pairs
#that were filtered out above become NA cells.
matrix_cor <- reshape2::acast(pop_new_cor, Var1~Var2, value.var="Freq")
#Plot the correlations
#is.corr = FALSE because the matrix is sparse/filtered, not a full correlation
#matrix; na.label = " " leaves the filtered-out cells blank.
corrplot(matrix_cor, method = c("circle"), is.corr = FALSE, tl.col="black", na.label=" ")
#Remove strongly correlated attributes
library(mlbench)
#Calculate the full Spearman correlation matrix of the filtered predictors
pop_correlation <- cor(filtered_pop, method = "spearman")
#Find attributes that are highly correlated (greater than 0.7 correlation coefficient) and sort them
correlated <- sort(findCorrelation(pop_correlation, cutoff = 0.7))
#Print the indexes of the highly correlated attributes
print(correlated)
## [1] 2 3 4 18 20 25 27 28 36 43 46 47 52 57
#Check the names of the correlated data
correlated_pop <- filtered_pop[correlated]
names(correlated_pop)
## [1] "n_tokens_content" "n_unique_tokens"
## [3] "n_non_stop_words" "kw_min_min"
## [5] "kw_avg_min" "kw_avg_avg"
## [7] "self_reference_max_shares" "self_reference_avg_sharess"
## [9] "is_weekend" "global_sentiment_polarity"
## [11] "rate_positive_words" "rate_negative_words"
## [13] "min_negative_polarity" "abs_title_sentiment_polarity"
#[1] "n_tokens_content" "n_unique_tokens" "n_non_stop_words"
#[4] "kw_min_min" "kw_avg_min" "kw_avg_avg"
#[7] "self_reference_max_shares" "self_reference_avg_sharess" "is_weekend"
#[10] "global_sentiment_polarity" "rate_positive_words" "rate_negative_words"
#[13] "min_negative_polarity" "abs_title_sentiment_polarity"
#There are 14 variables that may be removed due to being strongly correlated: number of words in the content, rate of
#unique words in the content, rate of non-stop words in the content, minimum shares of the worst keyword, average
#shares of the worst keyword, average shares of the average keyword, maximum shares of referenced articles in
#Mashable, average shares of referenced articles in Mashable, whether the article was published on the weekend,
#text sentiment polarity, rate of positive words among non-neutral tokens, rate of negative words among non-neutral
#tokens, minimum polarity of negative words, and absolute title sentiment polarity.
#Reduce the data, not including variables that have high correlation
#(negative indexing drops the 14 columns flagged by findCorrelation)
reduced_pop <- filtered_pop[-correlated]
#Check the names of the remaining variables
names(reduced_pop)
## [1] "n_tokens_title" "n_non_stop_unique_tokens"
## [3] "num_hrefs" "num_self_hrefs"
## [5] "num_imgs" "num_videos"
## [7] "average_token_length" "num_keywords"
## [9] "data_channel_is_lifestyle" "data_channel_is_entertainment"
## [11] "data_channel_is_bus" "data_channel_is_socmed"
## [13] "data_channel_is_tech" "data_channel_is_world"
## [15] "kw_max_min" "kw_max_max"
## [17] "kw_avg_max" "kw_min_avg"
## [19] "kw_max_avg" "self_reference_min_shares"
## [21] "weekday_is_monday" "weekday_is_tuesday"
## [23] "weekday_is_wednesday" "weekday_is_thursday"
## [25] "weekday_is_friday" "weekday_is_saturday"
## [27] "weekday_is_sunday" "LDA_00"
## [29] "LDA_01" "LDA_02"
## [31] "LDA_03" "LDA_04"
## [33] "global_subjectivity" "global_rate_positive_words"
## [35] "global_rate_negative_words" "avg_positive_polarity"
## [37] "min_positive_polarity" "max_positive_polarity"
## [39] "avg_negative_polarity" "max_negative_polarity"
## [41] "title_subjectivity" "title_sentiment_polarity"
## [43] "abs_title_subjectivity" "shares_cat"
#There are 44 remaining variables to be included in the analysis.
#[1] "n_tokens_title" "n_non_stop_unique_tokens" "num_hrefs"
#[4] "num_self_hrefs" "num_imgs" "num_videos"
#[7] "average_token_length" "num_keywords" "data_channel_is_lifestyle"
#[10] "data_channel_is_entertainment" "data_channel_is_bus" "data_channel_is_socmed"
#[13] "data_channel_is_tech" "data_channel_is_world" "kw_max_min"
#[16] "kw_max_max" "kw_avg_max" "kw_min_avg"
#[19] "kw_max_avg" "self_reference_min_shares" "weekday_is_monday"
#[22] "weekday_is_tuesday" "weekday_is_wednesday" "weekday_is_thursday"
#[25] "weekday_is_friday" "weekday_is_saturday" "weekday_is_sunday"
#[28] "LDA_00" "LDA_01" "LDA_02"
#[31] "LDA_03" "LDA_04" "global_subjectivity"
#[34] "global_rate_positive_words" "global_rate_negative_words" "avg_positive_polarity"
#[37] "min_positive_polarity" "max_positive_polarity" "avg_negative_polarity"
#[40] "max_negative_polarity" "title_subjectivity" "title_sentiment_polarity"
#[43] "abs_title_subjectivity" "shares_cat"
#Change the categorical variables back to factor to conduct exploratory analysis and dimensionality reduction
#(one lapply over a name vector instead of fourteen copy-pasted assignments)
categorical_cols <- c("data_channel_is_lifestyle", "data_channel_is_entertainment",
                      "data_channel_is_bus", "data_channel_is_socmed",
                      "data_channel_is_tech", "data_channel_is_world",
                      "weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday",
                      "weekday_is_thursday", "weekday_is_friday", "weekday_is_saturday",
                      "weekday_is_sunday", "shares_cat")
reduced_pop[categorical_cols] <- lapply(reduced_pop[categorical_cols], as.factor)
#Step 2 - Exploratory Analysis
#Normalization is applied to the dataset to scale the values of the features. Next, the data may be explored by #checking subsequences using association rules to find any patterns that may explain what makes an article popular, and #learn its characteristics, without making predictions yet.
#Normalization - normalize the numeric features in the data set
#Min-max rescale a numeric vector to [0, 1].
#x: numeric vector to rescale
#na.rm: remove NAs when computing the range (default FALSE preserves the
#       original behavior; NAs in x stay NA in the result either way)
#A constant vector would divide by zero (yielding NaN); it is mapped to all
#zeros instead.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  if (rng[1] == rng[2]) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / (rng[2] - rng[1])
}
#Determine the categorical variables: a column is "categorical" here when
#every value is 0/1.
#vapply iterates over columns directly; apply() on a data frame would first
#coerce the whole frame to a character matrix (an R anti-pattern). The result
#is unchanged: %in% coerces both factor levels ("0"/"1") and numeric 0/1 to
#character before matching, so binary factor and binary numeric columns are
#flagged TRUE and everything else FALSE, exactly as before.
categorical <- vapply(reduced_pop, function(x) all(x %in% 0:1), logical(1))
print(categorical)
## n_tokens_title n_non_stop_unique_tokens
## FALSE FALSE
## num_hrefs num_self_hrefs
## FALSE FALSE
## num_imgs num_videos
## FALSE FALSE
## average_token_length num_keywords
## FALSE FALSE
## data_channel_is_lifestyle data_channel_is_entertainment
## TRUE TRUE
## data_channel_is_bus data_channel_is_socmed
## TRUE TRUE
## data_channel_is_tech data_channel_is_world
## TRUE TRUE
## kw_max_min kw_max_max
## FALSE FALSE
## kw_avg_max kw_min_avg
## FALSE FALSE
## kw_max_avg self_reference_min_shares
## FALSE FALSE
## weekday_is_monday weekday_is_tuesday
## TRUE TRUE
## weekday_is_wednesday weekday_is_thursday
## TRUE TRUE
## weekday_is_friday weekday_is_saturday
## TRUE TRUE
## weekday_is_sunday LDA_00
## TRUE FALSE
## LDA_01 LDA_02
## FALSE FALSE
## LDA_03 LDA_04
## FALSE FALSE
## global_subjectivity global_rate_positive_words
## FALSE FALSE
## global_rate_negative_words avg_positive_polarity
## FALSE FALSE
## min_positive_polarity max_positive_polarity
## FALSE FALSE
## avg_negative_polarity max_negative_polarity
## FALSE FALSE
## title_subjectivity title_sentiment_polarity
## FALSE FALSE
## abs_title_subjectivity shares_cat
## FALSE TRUE
#Apply the normalize function to the numeric columns, then glue the untouched
#categorical columns back on, all in a single step.
pop_norm <- cbind(as.data.frame(lapply(reduced_pop[!categorical], normalize)),
                  reduced_pop[categorical])
#Check the five number summary and frequency tables for the normalized dataset
summary(pop_norm)
## n_tokens_title n_non_stop_unique_tokens num_hrefs num_self_hrefs
## Min. :0.0000 Min. :0.0000000 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.3333 1st Qu.:0.0009627 1st Qu.:0.01316 1st Qu.:0.008621
## Median :0.3810 Median :0.0010623 Median :0.02632 Median :0.025862
## Mean :0.3999 Mean :0.0010603 Mean :0.03580 Mean :0.028393
## 3rd Qu.:0.4762 3rd Qu.:0.0011610 3rd Qu.:0.04605 3rd Qu.:0.034483
## Max. :1.0000 Max. :1.0000000 Max. :1.00000 Max. :1.000000
## num_imgs num_videos average_token_length num_keywords
## Min. :0.000000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.007812 1st Qu.:0.00000 1st Qu.:0.5569 1st Qu.:0.5556
## Median :0.007812 Median :0.00000 Median :0.5800 Median :0.6667
## Mean :0.035501 Mean :0.01373 Mean :0.5656 Mean :0.6915
## 3rd Qu.:0.031250 3rd Qu.:0.01099 3rd Qu.:0.6037 3rd Qu.:0.8889
## Max. :1.000000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## kw_max_min kw_max_max kw_avg_max kw_min_avg
## Min. :0.000000 Min. :0.0000 Min. :0.0000 Min. :0.0000000
## 1st Qu.:0.001491 1st Qu.:1.0000 1st Qu.:0.2050 1st Qu.:0.0002767
## Median :0.002212 Median :1.0000 Median :0.2900 Median :0.2835153
## Mean :0.003867 Mean :0.8921 Mean :0.3075 Mean :0.3093897
## 3rd Qu.:0.003351 3rd Qu.:1.0000 3rd Qu.:0.3925 3rd Qu.:0.5693853
## Max. :1.000000 Max. :1.0000 Max. :1.0000 Max. :1.0000000
## kw_max_avg self_reference_min_shares LDA_00
## Min. :0.00000 Min. :0.0000000 Min. :0.00000
## 1st Qu.:0.01194 1st Qu.:0.0007577 1st Qu.:0.02702
## Median :0.01460 Median :0.0014230 Median :0.03602
## Mean :0.01896 Mean :0.0047418 Mean :0.19914
## 3rd Qu.:0.02017 3rd Qu.:0.0030831 3rd Qu.:0.25993
## Max. :1.00000 Max. :1.0000000 Max. :1.00000
## LDA_01 LDA_02 LDA_03 LDA_04
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.02701 1st Qu.:0.03106 1st Qu.:0.03084 1st Qu.:0.03082
## Median :0.03601 Median :0.04348 Median :0.04317 Median :0.04393
## Mean :0.15255 Mean :0.23513 Mean :0.24151 Mean :0.25241
## 3rd Qu.:0.16289 3rd Qu.:0.36328 3rd Qu.:0.40556 3rd Qu.:0.43140
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## global_subjectivity global_rate_positive_words global_rate_negative_words
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.3962 1st Qu.:0.1825 1st Qu.:0.05199
## Median :0.4535 Median :0.2510 Median :0.08294
## Mean :0.4434 Mean :0.2548 Mean :0.08983
## 3rd Qu.:0.5083 3rd Qu.:0.3234 3rd Qu.:0.11755
## Max. :1.0000 Max. :1.0000 Max. :1.00000
## avg_positive_polarity min_positive_polarity max_positive_polarity
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3062 1st Qu.:0.05000 1st Qu.:0.6000
## Median :0.3588 Median :0.10000 Median :0.8000
## Mean :0.3538 Mean :0.09545 Mean :0.7567
## 3rd Qu.:0.4114 3rd Qu.:0.10000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
## avg_negative_polarity max_negative_polarity title_subjectivity
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6716 1st Qu.:0.8750 1st Qu.:0.0000
## Median :0.7467 Median :0.9000 Median :0.1500
## Mean :0.7405 Mean :0.8925 Mean :0.2824
## 3rd Qu.:0.8131 3rd Qu.:0.9500 3rd Qu.:0.5000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## title_sentiment_polarity abs_title_subjectivity data_channel_is_lifestyle
## Min. :0.0000 Min. :0.0000 0:37545
## 1st Qu.:0.5000 1st Qu.:0.3333 1: 2099
## Median :0.5000 Median :1.0000
## Mean :0.5357 Mean :0.6837
## 3rd Qu.:0.5750 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
## data_channel_is_entertainment data_channel_is_bus data_channel_is_socmed
## 0:32587 0:33386 0:37321
## 1: 7057 1: 6258 1: 2323
##
##
##
##
## data_channel_is_tech data_channel_is_world weekday_is_monday
## 0:32298 0:31217 0:32983
## 1: 7346 1: 8427 1: 6661
##
##
##
##
## weekday_is_tuesday weekday_is_wednesday weekday_is_thursday weekday_is_friday
## 0:32254 0:32209 0:32377 0:33943
## 1: 7390 1: 7435 1: 7267 1: 5701
##
##
##
##
## weekday_is_saturday weekday_is_sunday shares_cat
## 0:37191 0:36907 0:20082
## 1: 2453 1: 2737 1:19562
##
##
##
##
#Check subsequences - Association rules
#install.packages("arules")
library(arules)
## Warning: package 'arules' was built under R version 3.6.2
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
#install.packages("arulesViz")
library(arulesViz)
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
#install.packages("plyr")
library(plyr)
#install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:arules':
##
## intersect, recode, setdiff, setequal, union
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("arulesCBA")
library(arulesCBA)
## Warning: package 'arulesCBA' was built under R version 3.6.2
#Extract rules using apriori algorithm, setting minimum support and confidence.
#First, subset the data to only those that are popular
mostpopular <- subset(pop_norm, shares_cat == 1)
#Remove the popularity label by name rather than by a hard-coded column index
#(the original used [-44], which silently drops the wrong column if the
#dataset layout ever changes)
mostpopular$shares_cat <- NULL
#Change the dataframe into transaction data; non-factor columns are
#discretized by arules with its default binning (see the warnings below)
transaction_object <- as(mostpopular, "transactions")
## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
## 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 not logical or factor.
## Applying default discretization (see '? discretizeDF').
## Warning in discretize(x = c(0, 0, 0, 0, 0.230769230769231, 0, 0.010989010989011, : The calculated breaks are: 0, 0, 0.010989010989011, 1
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, : The calculated breaks are: 0, 1, 1, 1
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(1, 0.8, 1, 0.7, 1, 0.6, 1, 1, 0.5, 1, 1, 0.6, : The calculated breaks are: 0, 0.7, 1, 1
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(0, 0.125, 0, 0, 1, 0.75, 0.566666666667, 0, : The calculated breaks are: 0, 0, 0.451443001443, 1
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(1, 0.75, 1, 1, 1, 0.5, 0.1333333333334, 1, 0.714285714286, : The calculated breaks are: 0, 0.466666666666, 1, 1
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
#Check the transaction data
#glimpse() from dplyr prints the S4 slot structure of the arules
#'transactions' object (sparse item matrix + item/transaction metadata)
glimpse(transaction_object)
## Formal class 'transactions' [package "arules"] with 3 slots
## ..@ data :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
## ..@ itemInfo :'data.frame': 110 obs. of 3 variables:
## .. ..$ labels : chr [1:110] "n_tokens_title=[0.0476,0.333)" "n_tokens_title=[0.333,0.429)" "n_tokens_title=[0.429,1]" "n_non_stop_unique_tokens=[0,0.000987)" ...
## .. ..$ variables: Factor w/ 43 levels "abs_title_subjectivity",..: 28 28 28 27 27 27 29 29 29 32 ...
## .. ..$ levels : Factor w/ 86 levels "[0,0.000277)",..: 52 65 69 2 31 32 9 42 50 8 ...
## ..@ itemsetInfo:'data.frame': 19562 obs. of 1 variable:
## .. ..$ transactionID: chr [1:19562] "3" "9" "11" "12" ...
#Summarise the transactions: size, density, and most frequent items
summary(transaction_object)
## transactions as itemMatrix in sparse format with
## 19562 rows (elements/itemsets/transactions) and
## 110 columns (items) and a density of 0.3909091
##
## most frequent items:
## kw_max_max=[0,1] data_channel_is_lifestyle=0
## 19562 18362
## data_channel_is_socmed=0 weekday_is_saturday=0
## 17903 17842
## weekday_is_sunday=0 (Other)
## 17801 749696
##
## element (itemset/transaction) length distribution:
## sizes
## 43
## 19562
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 43 43 43 43 43 43
##
## includes extended item information - examples:
## labels variables levels
## 1 n_tokens_title=[0.0476,0.333) n_tokens_title [0.0476,0.333)
## 2 n_tokens_title=[0.333,0.429) n_tokens_title [0.333,0.429)
## 3 n_tokens_title=[0.429,1] n_tokens_title [0.429,1]
##
## includes extended transaction information - examples:
## transactionID
## 1 3
## 2 9
## 3 11
#Create an item frequency plot to view the distributions for the top 20 items
#Install RColorBrewer on demand. Use requireNamespace() for the availability
#check (require() is discouraged for dependency loading because it returns
#FALSE instead of erroring), then attach the package explicitly.
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  #Install the color package of R
  install.packages("RColorBrewer")
}
#Use the library RColorBrewer
library(RColorBrewer)
#Absolute Item Frequency Plot (raw counts)
itemFrequencyPlot(transaction_object, topN = 20, type = "absolute",
                  col = brewer.pal(8, "Pastel2"),
                  main = "Absolute Item Frequency Plot")
#Relative Item Frequency Plot (proportion of transactions)
itemFrequencyPlot(transaction_object, topN = 20, type = "relative",
                  col = brewer.pal(8, "Pastel2"),
                  main = "Relative Item Frequency Plot")
#The most frequently occurring items are the maximum shares of the best keyword followed by whether the data channel is #lifestyle, the data channel is social media, the article was posted on Saturday, the article was posted on Sunday, the #data channel is entertainment, the article was posted on Friday, data channel is world, data channel is business, the #article was posted on Monday, and so on.
#Mine association rules with the apriori algorithm.
#Thresholds: minimum support = 0.5, minimum confidence = 0.9; minlen = 2
#excludes trivial single-item "rules" with an empty left-hand side.
association.rules <- apriori(
  transaction_object,
  parameter = list(supp = 0.5, conf = 0.9, minlen = 2)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.9 0.1 1 none FALSE TRUE 5 0.5 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9781
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[110 item(s), 19562 transaction(s)] done [0.08s].
## sorting and recoding items ... [21 item(s)] done [0.01s].
## creating transaction tree ... done [0.02s].
## checking subsets of size 1 2 3 4 5 6 done [0.10s].
## writing ... [2763 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
#Summarise the mined rules: rule count, length distribution, and
#quality measures (support, confidence, coverage, lift)
summary(association.rules)
## set of 2763 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6
## 85 509 1085 889 195
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 4.000 4.000 4.217 5.000 6.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.5001 Min. :0.9000 Min. :0.5001 Min. :0.9603
## 1st Qu.:0.5220 1st Qu.:0.9130 1st Qu.:0.5572 1st Qu.:0.9912
## Median :0.5595 Median :0.9315 Median :0.5867 Median :1.0000
## Mean :0.5822 Mean :0.9498 Mean :0.6141 Mean :0.9956
## 3rd Qu.:0.6232 3rd Qu.:1.0000 3rd Qu.:0.6522 3rd Qu.:1.0000
## Max. :0.9387 Max. :1.0000 Max. :1.0000 Max. :1.0556
## count
## Min. : 9782
## 1st Qu.:10212
## Median :10944
## Mean :11389
## 3rd Qu.:12191
## Max. :18362
##
## mining info:
## data ntransactions support confidence
## transaction_object 19562 0.5 0.9
#Display the first 20 mined rules with their quality measures
inspect(head(association.rules, n = 20))
## lhs rhs support confidence coverage lift count
## [1] {title_sentiment_polarity=[0.5,0.55)} => {kw_max_max=[0,1]} 0.5292404 1.0000000 0.5292404 1.0000000 10353
## [2] {min_positive_polarity=[0.1,1]} => {weekday_is_sunday=0} 0.5311829 0.9117312 0.5826091 1.0019260 10391
## [3] {min_positive_polarity=[0.1,1]} => {weekday_is_saturday=0} 0.5334833 0.9156796 0.5826091 1.0039527 10436
## [4] {min_positive_polarity=[0.1,1]} => {data_channel_is_socmed=0} 0.5544423 0.9516539 0.5826091 1.0398399 10846
## [5] {min_positive_polarity=[0.1,1]} => {data_channel_is_lifestyle=0} 0.5439628 0.9336668 0.5826091 0.9946841 10641
## [6] {min_positive_polarity=[0.1,1]} => {kw_max_max=[0,1]} 0.5826091 1.0000000 0.5826091 1.0000000 11397
## [7] {num_videos=[0,0.011)} => {data_channel_is_entertainment=0} 0.5625192 0.9116063 0.6170637 1.0540751 11004
## [8] {num_videos=[0,0.011)} => {weekday_is_sunday=0} 0.5593498 0.9064701 0.6170637 0.9961444 10942
## [9] {num_videos=[0,0.011)} => {weekday_is_saturday=0} 0.5561292 0.9012509 0.6170637 0.9881331 10879
## [10] {num_videos=[0,0.011)} => {data_channel_is_socmed=0} 0.5554136 0.9000911 0.6170637 0.9834990 10865
## [11] {num_videos=[0,0.011)} => {data_channel_is_lifestyle=0} 0.5697781 0.9233701 0.6170637 0.9837145 11146
## [12] {num_videos=[0,0.011)} => {kw_max_max=[0,1]} 0.6170637 1.0000000 0.6170637 1.0000000 12071
## [13] {kw_min_avg=[0.000277,0.542)} => {weekday_is_sunday=0} 0.6069931 0.9107924 0.6664451 1.0008943 11874
## [14] {kw_min_avg=[0.000277,0.542)} => {weekday_is_saturday=0} 0.6108782 0.9166219 0.6664451 1.0049859 11950
## [15] {kw_min_avg=[0.000277,0.542)} => {data_channel_is_socmed=0} 0.6148144 0.9225282 0.6664451 1.0080152 12027
## [16] {kw_min_avg=[0.000277,0.542)} => {data_channel_is_lifestyle=0} 0.6262141 0.9396334 0.6664451 1.0010406 12250
## [17] {kw_min_avg=[0.000277,0.542)} => {kw_max_max=[0,1]} 0.6664451 1.0000000 0.6664451 1.0000000 13037
## [18] {title_subjectivity=[0,0.451)} => {weekday_is_sunday=0} 0.6099581 0.9149605 0.6666496 1.0054748 11932
## [19] {title_subjectivity=[0,0.451)} => {weekday_is_saturday=0} 0.6091913 0.9138103 0.6666496 1.0019032 11917
## [20] {title_subjectivity=[0,0.451)} => {data_channel_is_socmed=0} 0.6059708 0.9089794 0.6666496 0.9932109 11854
#Rank the rules from highest to lowest lift
rules_lift <- sort(association.rules, decreasing = TRUE, by = "lift")
#Inspect the support, confidence, coverage, and lift of the 20 strongest rules
inspect(head(rules_lift, n = 20))
## lhs rhs support confidence coverage lift count
## [1] {num_videos=[0,0.011),
## weekday_is_sunday=0} => {data_channel_is_entertainment=0} 0.5106329 0.9129044 0.5593498 1.055576 9989
## [2] {num_videos=[0,0.011),
## kw_max_max=[0,1],
## weekday_is_sunday=0} => {data_channel_is_entertainment=0} 0.5106329 0.9129044 0.5593498 1.055576 9989
## [3] {num_videos=[0,0.011)} => {data_channel_is_entertainment=0} 0.5625192 0.9116063 0.6170637 1.054075 11004
## [4] {num_videos=[0,0.011),
## kw_max_max=[0,1]} => {data_channel_is_entertainment=0} 0.5625192 0.9116063 0.6170637 1.054075 11004
## [5] {num_videos=[0,0.011),
## weekday_is_saturday=0} => {data_channel_is_entertainment=0} 0.5066967 0.9111132 0.5561292 1.053505 9912
## [6] {num_videos=[0,0.011),
## kw_max_max=[0,1],
## weekday_is_saturday=0} => {data_channel_is_entertainment=0} 0.5066967 0.9111132 0.5561292 1.053505 9912
## [7] {num_videos=[0,0.011),
## data_channel_is_lifestyle=0} => {data_channel_is_entertainment=0} 0.5152336 0.9042706 0.5697781 1.045593 10079
## [8] {num_videos=[0,0.011),
## kw_max_max=[0,1],
## data_channel_is_lifestyle=0} => {data_channel_is_entertainment=0} 0.5152336 0.9042706 0.5697781 1.045593 10079
## [9] {num_videos=[0,0.011),
## data_channel_is_socmed=0} => {data_channel_is_entertainment=0} 0.5008690 0.9017948 0.5554136 1.042730 9798
## [10] {num_videos=[0,0.011),
## kw_max_max=[0,1],
## data_channel_is_socmed=0} => {data_channel_is_entertainment=0} 0.5008690 0.9017948 0.5554136 1.042730 9798
## [11] {min_positive_polarity=[0.1,1]} => {data_channel_is_socmed=0} 0.5544423 0.9516539 0.5826091 1.039840 10846
## [12] {kw_max_max=[0,1],
## min_positive_polarity=[0.1,1]} => {data_channel_is_socmed=0} 0.5544423 0.9516539 0.5826091 1.039840 10846
## [13] {min_positive_polarity=[0.1,1],
## weekday_is_sunday=0} => {data_channel_is_socmed=0} 0.5051120 0.9509191 0.5311829 1.039037 9881
## [14] {kw_max_max=[0,1],
## min_positive_polarity=[0.1,1],
## weekday_is_sunday=0} => {data_channel_is_socmed=0} 0.5051120 0.9509191 0.5311829 1.039037 9881
## [15] {min_positive_polarity=[0.1,1],
## weekday_is_saturday=0} => {data_channel_is_socmed=0} 0.5071567 0.9506516 0.5334833 1.038745 9921
## [16] {kw_max_max=[0,1],
## min_positive_polarity=[0.1,1],
## weekday_is_saturday=0} => {data_channel_is_socmed=0} 0.5071567 0.9506516 0.5334833 1.038745 9921
## [17] {min_positive_polarity=[0.1,1],
## data_channel_is_lifestyle=0} => {data_channel_is_socmed=0} 0.5157959 0.9482192 0.5439628 1.036087 10090
## [18] {kw_max_max=[0,1],
## min_positive_polarity=[0.1,1],
## data_channel_is_lifestyle=0} => {data_channel_is_socmed=0} 0.5157959 0.9482192 0.5439628 1.036087 10090
## [19] {abs_title_subjectivity=[0.467,1],
## data_channel_is_lifestyle=0,
## data_channel_is_entertainment=0} => {weekday_is_sunday=0} 0.5025560 0.9206780 0.5458542 1.011758 9831
## [20] {kw_max_max=[0,1],
## abs_title_subjectivity=[0.467,1],
## data_channel_is_lifestyle=0,
## data_channel_is_entertainment=0} => {weekday_is_sunday=0} 0.5025560 0.9206780 0.5458542 1.011758 9831
#lhs rhs support confidence coverage lift count
#[1] {num_videos=[0,0.011),
#weekday_is_sunday=0} => {data_channel_is_entertainment=0} 0.5106329 0.9129044 0.5593498 1.055576 9989
#[2] {num_videos=[0,0.011),
#kw_max_max=[0,1],
#weekday_is_sunday=0} => {data_channel_is_entertainment=0} 0.5106329 0.9129044 0.5593498 1.055576 9989
#[3] {num_videos=[0,0.011)} => {data_channel_is_entertainment=0} 0.5625192 0.9116063 0.6170637 1.054075 11004
#[4] {num_videos=[0,0.011),
#kw_max_max=[0,1]} => {data_channel_is_entertainment=0} 0.5625192 0.9116063 0.6170637 1.054075 11004
#[5] {num_videos=[0,0.011),
#weekday_is_saturday=0} => {data_channel_is_entertainment=0} 0.5066967 0.9111132 0.5561292 1.053505 9912
#[6] {num_videos=[0,0.011),
#kw_max_max=[0,1],
#weekday_is_saturday=0} => {data_channel_is_entertainment=0} 0.5066967 0.9111132 0.5561292 1.053505 9912
#[7] {num_videos=[0,0.011),
#data_channel_is_lifestyle=0} => {data_channel_is_entertainment=0} 0.5152336 0.9042706 0.5697781 1.045593 10079
#[8] {num_videos=[0,0.011),
#kw_max_max=[0,1],
#data_channel_is_lifestyle=0} => {data_channel_is_entertainment=0} 0.5152336 0.9042706 0.5697781 1.045593 10079
#[9] {num_videos=[0,0.011),
#data_channel_is_socmed=0} => {data_channel_is_entertainment=0} 0.5008690 0.9017948 0.5554136 1.042730 9798
#[10] {num_videos=[0,0.011),
#kw_max_max=[0,1],
#data_channel_is_socmed=0} => {data_channel_is_entertainment=0} 0.5008690 0.9017948 0.5554136 1.042730 9798
#[11] {min_positive_polarity=[0.1,1]} => {data_channel_is_socmed=0} 0.5544423 0.9516539 0.5826091 1.039840 10846
#[12] {kw_max_max=[0,1],
#min_positive_polarity=[0.1,1]} => {data_channel_is_socmed=0} 0.5544423 0.9516539 0.5826091 1.039840 10846
#[13] {min_positive_polarity=[0.1,1],
#weekday_is_sunday=0} => {data_channel_is_socmed=0} 0.5051120 0.9509191 0.5311829 1.039037 9881
#[14] {kw_max_max=[0,1],
#min_positive_polarity=[0.1,1],
#weekday_is_sunday=0} => {data_channel_is_socmed=0} 0.5051120 0.9509191 0.5311829 1.039037 9881
#[15] {min_positive_polarity=[0.1,1],
#weekday_is_saturday=0} => {data_channel_is_socmed=0} 0.5071567 0.9506516 0.5334833 1.038745 9921
#[16] {kw_max_max=[0,1],
#min_positive_polarity=[0.1,1],
#weekday_is_saturday=0} => {data_channel_is_socmed=0} 0.5071567 0.9506516 0.5334833 1.038745 9921
#[17] {min_positive_polarity=[0.1,1],
#data_channel_is_lifestyle=0} => {data_channel_is_socmed=0} 0.5157959 0.9482192 0.5439628 1.036087 10090
#[18] {kw_max_max=[0,1],
#min_positive_polarity=[0.1,1],
#data_channel_is_lifestyle=0} => {data_channel_is_socmed=0} 0.5157959 0.9482192 0.5439628 1.036087 10090
#[19] {abs_title_subjectivity=[0.467,1],
#data_channel_is_lifestyle=0,
#data_channel_is_entertainment=0} => {weekday_is_sunday=0} 0.5025560 0.9206780 0.5458542 1.011758 9831
#[20] {kw_max_max=[0,1],
#abs_title_subjectivity=[0.467,1],
#data_channel_is_lifestyle=0,
#data_channel_is_entertainment=0} => {weekday_is_sunday=0} 0.5025560 0.9206780 0.5458542 1.011758 9831
#Some interesting rules found that may help determine whether an article will be popular include:
#51% of the transactions show that 91% of popular articles with the number of videos between 0 and the bottom 1% of the
#distribution, maximum shares of the best keyword, and not published on Sunday (or Saturday), are also not entertainment.
#50-52% of the transactions show that 90% of popular articles with the number of videos between 0 and the bottom 1% of
#the distribution, maximum shares of the best keyword, and not social media (or lifestyle), are also not entertainment.
#51% of the transactions show that 95% of popular articles with maximum shares of the best keyword, with the top 90%
#of the minimum polarity of positive words, and not published on Saturday (or Sunday), are also not social media.
#52% of the transactions show that 95% of popular articles with maximum shares of the best keyword, with the top 90%
#of the minimum polarity of positive words, and not lifestyle, are also not social media.
#50% of transactions show that 92% of popular articles with maximum shares of the best keyword, absolute subjectivity
#level in the top 53% of the distribution, and not lifestyle, or entertainment, are also not published on Sunday.
#Plot the rules
#Default arulesViz scatterplot: support (x) vs confidence (y), shaded by lift
plot(rules_lift, main = "Scatterplot for Association Rules")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
#This plot shows that rules with high lift have low support.
#Two-key plot
#The order shows the number of items in a rule
plot(rules_lift, method = "two-key plot")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
#There are more items for rules with lower support.
#Interactive scatter-plot
#Hover over each rule to show the items and quality measures (support, confidence, lift)
#plotly_arules() is deprecated in arulesViz (the captured warning says to use
#plot() instead); the plotly engine gives the same interactive scatter-plot
plot(rules_lift, engine = "plotly")
#Interactive graph-based visualization
#Keep only the 20 rules with the highest lift for a readable graph
subset_rules <- head(association.rules, n = 20, by = "lift",
                     decreasing = TRUE)
plot(subset_rules, method = "graph", engine = "htmlwidget")
#Individual rule representation - parallel coordinates plot
#Visualize what items cause other items in the set.
#The right-hand side (RHS)/consequent is the item the set is proposed to
#have; the left-hand side shows the most recent additions to the set.
plot(subset_rules, method = "paracoord")
#Step 3 - Dimensionality Reduction
#The Random Forest feature selection method, which uses the Gini index to assign a score and rank features, may be used #to help select the features to be used in the analysis
#Feature selection with Random Forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
#Train a Random Forest classifier on the popularity label.
#Fix the RNG seed so the forest (and importance scores) are reproducible.
set.seed(123)
rf_class <- randomForest(
  shares_cat ~ .,
  data = pop_norm,
  importance = TRUE
)
#Report OOB error estimate and confusion matrix
print(rf_class)
##
## Call:
## randomForest(formula = shares_cat ~ ., data = pop_norm, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 33.71%
## Confusion matrix:
## 0 1 class.error
## 0 13401 6681 0.3326860
## 1 6682 12880 0.3415806
#Number of trees: 500
#No. of variables tried at each split: 6
#OOB estimate of error rate: 33.71%
#Confusion matrix:
# 0 1 class.error
#0 13401 6681 0.3326860
#1 6682 12880 0.3415806
#Plot the results that show the change in errors as the number of trees increases
#(plot.randomForest draws error rate versus number of trees)
plot(rf_class)
#The errors decrease until around 300 trees and then flatten to a rate of around 34%
#Evaluate variable importance according to mean decrease in accuracy and mean decrease in gini
#Use <- for assignment throughout (the originals used = at top level)
#Mean Decrease in Accuracy (importance type = 1)
imp_class <- importance(rf_class, type = 1)
imp_class <- data.frame(predictors = rownames(imp_class), imp_class)
#Mean Decrease Gini (importance type = 2)
imp_gini <- importance(rf_class, type = 2)
imp_gini <- data.frame(predictors = rownames(imp_gini), imp_gini)
#Order the predictors by importance according to Mean Decrease in Accuracy
imp_class.sort <- arrange(imp_class, desc(MeanDecreaseAccuracy))
#Lock the factor levels to the sorted order so downstream plots keep this ranking
imp_class.sort$predictors <- factor(imp_class.sort$predictors, levels = imp_class.sort$predictors)
#Check the importance of the predictors sorted by Mean Decrease in Accuracy
print(imp_class.sort)
## predictors MeanDecreaseAccuracy
## 1 kw_max_avg 61.9394440
## 2 self_reference_min_shares 55.0765011
## 3 LDA_00 44.4673117
## 4 kw_min_avg 43.7175089
## 5 LDA_02 42.5568508
## 6 weekday_is_saturday 40.1188224
## 7 LDA_04 36.3968977
## 8 data_channel_is_socmed 35.5173088
## 9 data_channel_is_entertainment 35.2841318
## 10 LDA_01 35.2136206
## 11 LDA_03 33.3672259
## 12 n_non_stop_unique_tokens 30.5891593
## 13 num_hrefs 30.4651134
## 14 kw_max_max 30.4153065
## 15 num_imgs 30.3795300
## 16 kw_avg_max 29.0897650
## 17 weekday_is_sunday 27.9019685
## 18 average_token_length 27.0670120
## 19 global_rate_positive_words 26.3063715
## 20 global_subjectivity 24.6449979
## 21 data_channel_is_world 23.7709442
## 22 data_channel_is_tech 22.7403111
## 23 min_positive_polarity 22.5604469
## 24 num_keywords 20.0040417
## 25 num_self_hrefs 19.8729009
## 26 kw_max_min 19.3566300
## 27 global_rate_negative_words 17.1153821
## 28 max_positive_polarity 16.6575253
## 29 data_channel_is_bus 16.3356759
## 30 num_videos 16.1676228
## 31 avg_negative_polarity 15.5759165
## 32 avg_positive_polarity 15.5735591
## 33 max_negative_polarity 14.9521841
## 34 title_sentiment_polarity 12.1364370
## 35 title_subjectivity 11.6057996
## 36 abs_title_subjectivity 8.3462881
## 37 n_tokens_title 5.8054314
## 38 weekday_is_wednesday 3.0249403
## 39 weekday_is_friday 1.4720977
## 40 weekday_is_tuesday 1.4080527
## 41 data_channel_is_lifestyle 1.3611512
## 42 weekday_is_monday -0.1673653
## 43 weekday_is_thursday -0.5224005
# predictors MeanDecreaseAccuracy
#1 kw_max_avg 61.9394440
#2 self_reference_min_shares 55.0765011
#3 LDA_00 44.4673117
#4 kw_min_avg 43.7175089
#5 LDA_02 42.5568508
#6 weekday_is_saturday 40.1188224
#7 LDA_04 36.3968977
#8 data_channel_is_socmed 35.5173088
#9 data_channel_is_entertainment 35.2841318
#10 LDA_01 35.2136206
#11 LDA_03 33.3672259
#12 n_non_stop_unique_tokens 30.5891593
#13 num_hrefs 30.4651134
#14 kw_max_max 30.4153065
#15 num_imgs 30.3795300
#16 kw_avg_max 29.0897650
#17 weekday_is_sunday 27.9019685
#18 average_token_length 27.0670120
#19 global_rate_positive_words 26.3063715
#20 global_subjectivity 24.6449979
#21 data_channel_is_world 23.7709442
#22 data_channel_is_tech 22.7403111
#23 min_positive_polarity 22.5604469
#24 num_keywords 20.0040417
#25 num_self_hrefs 19.8729009
#26 kw_max_min 19.3566300
#27 global_rate_negative_words 17.1153821
#28 max_positive_polarity 16.6575253
#29 data_channel_is_bus 16.3356759
#30 num_videos 16.1676228
#31 avg_negative_polarity 15.5759165
#32 avg_positive_polarity 15.5735591
#33 max_negative_polarity 14.9521841
#34 title_sentiment_polarity 12.1364370
#35 title_subjectivity 11.6057996
#36 abs_title_subjectivity 8.3462881
#37 n_tokens_title 5.8054314
#38 weekday_is_wednesday 3.0249403
#39 weekday_is_friday 1.4720977
#40 weekday_is_tuesday 1.4080527
#41 data_channel_is_lifestyle 1.3611512
#42 weekday_is_monday -0.1673653
#43 weekday_is_thursday -0.5224005
#The top predictors in terms of Mean Decrease in Accuracy include maximum shares of the average keyword, minimum shares
#of referenced articles in Mashable, closeness to LDA topic 0, minimum shares of the average keyword, closeness to LDA
#topic 2, whether the article was published on Saturday, closeness to LDA topic 4, whether the data channel is social
#media, whether the data channel is entertainment, closeness to LDA topic 1, and so on.
#Note about LDA topics: natural language features were extracted from the Mashable news service using the Latent
#Dirichlet Allocation algorithm which were applied to Mashable texts before publication to identify the top five
#relevant topics and measure the closeness of the current article to those topics.
#Order the predictors by importance according to Mean Decrease in Gini
#Refer to the column directly inside arrange(); the original's
#desc(imp_gini$MeanDecreaseGini) bypasses dplyr's data masking
imp_gini.sort <- arrange(imp_gini, desc(MeanDecreaseGini))
#Lock the factor levels to the sorted order so downstream plots keep this ranking
imp_gini.sort$predictors <- factor(imp_gini.sort$predictors, levels = imp_gini.sort$predictors)
#Check the predictors sorted by Mean Decrease in Gini
print(imp_gini.sort)
## predictors MeanDecreaseGini
## 1 kw_max_avg 1153.92287
## 2 self_reference_min_shares 946.62975
## 3 LDA_02 907.79724
## 4 kw_avg_max 814.85549
## 5 n_non_stop_unique_tokens 806.63610
## 6 LDA_01 803.72210
## 7 LDA_04 797.00683
## 8 LDA_00 794.83394
## 9 global_subjectivity 774.46316
## 10 average_token_length 756.70179
## 11 global_rate_positive_words 754.00867
## 12 kw_max_min 753.72227
## 13 LDA_03 751.77275
## 14 kw_min_avg 728.77415
## 15 avg_positive_polarity 723.53471
## 16 global_rate_negative_words 716.73448
## 17 avg_negative_polarity 694.98664
## 18 num_hrefs 609.62104
## 19 n_tokens_title 445.94421
## 20 title_sentiment_polarity 442.60076
## 21 max_negative_polarity 411.15707
## 22 num_imgs 410.05621
## 23 num_self_hrefs 393.12895
## 24 title_subjectivity 379.42032
## 25 min_positive_polarity 363.90795
## 26 abs_title_subjectivity 329.79344
## 27 max_positive_polarity 306.06629
## 28 num_keywords 272.23733
## 29 num_videos 235.01009
## 30 kw_max_max 212.02053
## 31 data_channel_is_entertainment 198.25991
## 32 weekday_is_saturday 162.87000
## 33 data_channel_is_world 155.49323
## 34 data_channel_is_socmed 116.90543
## 35 weekday_is_sunday 108.09454
## 36 data_channel_is_tech 104.43794
## 37 weekday_is_wednesday 81.20075
## 38 weekday_is_tuesday 79.40986
## 39 weekday_is_thursday 74.68520
## 40 weekday_is_monday 70.99114
## 41 weekday_is_friday 70.29596
## 42 data_channel_is_bus 50.91741
## 43 data_channel_is_lifestyle 37.07451
# predictors MeanDecreaseGini
#1 kw_max_avg 1153.92287
#2 self_reference_min_shares 946.62975
#3 LDA_02 907.79724
#4 kw_avg_max 814.85549
#5 n_non_stop_unique_tokens 806.63610
#6 LDA_01 803.72210
#7 LDA_04 797.00683
#8 LDA_00 794.83394
#9 global_subjectivity 774.46316
#10 average_token_length 756.70179
#11 global_rate_positive_words 754.00867
#12 kw_max_min 753.72227
#13 LDA_03 751.77275
#14 kw_min_avg 728.77415
#15 avg_positive_polarity 723.53471
#16 global_rate_negative_words 716.73448
#17 avg_negative_polarity 694.98664
#18 num_hrefs 609.62104
#19 n_tokens_title 445.94421
#20 title_sentiment_polarity 442.60076
#21 max_negative_polarity 411.15707
#22 num_imgs 410.05621
#23 num_self_hrefs 393.12895
#24 title_subjectivity 379.42032
#25 min_positive_polarity 363.90795
#26 abs_title_subjectivity 329.79344
#27 max_positive_polarity 306.06629
#28 num_keywords 272.23733
#29 num_videos 235.01009
#30 kw_max_max 212.02053
#31 data_channel_is_entertainment 198.25991
#32 weekday_is_saturday 162.87000
#33 data_channel_is_world 155.49323
#34 data_channel_is_socmed 116.90543
#35 weekday_is_sunday 108.09454
#36 data_channel_is_tech 104.43794
#37 weekday_is_wednesday 81.20075
#38 weekday_is_tuesday 79.40986
#39 weekday_is_thursday 74.68520
#40 weekday_is_monday 70.99114
#41 weekday_is_friday 70.29596
#42 data_channel_is_bus 50.91741
#43 data_channel_is_lifestyle 37.07451
#The top predictors in terms of Mean Decrease in Gini include maximum shares of the average keyword, minimum shares of
#referenced articles in Mashable, closeness to LDA topic 2, average shares of the best keyword, rate of unique non-stop
#words in the content, closeness to LDA topic 1, closeness to LDA topic 4, closeness to LDA topic 0, text subjectivity,
#average length of the words in the content, and so on.
#Plot variable importance under each measure, then combined
varImpPlot(rf_class, type = 1, main = "Variable Importance (Mean Decrease in Accuracy)")
varImpPlot(rf_class, type = 2, main = "Variable Importance (Mean Decrease in Gini)")
varImpPlot(rf_class, main = "Variable Importance")
#Keep the 20 highest-ranked predictors under each importance measure
imp_class.top <- head(imp_class.sort, 20)
imp_gini.top <- head(imp_gini.sort, 20)
#Print the top predictors according to Mean Decrease in Accuracy
print(imp_class.top)
## predictors MeanDecreaseAccuracy
## 1 kw_max_avg 61.93944
## 2 self_reference_min_shares 55.07650
## 3 LDA_00 44.46731
## 4 kw_min_avg 43.71751
## 5 LDA_02 42.55685
## 6 weekday_is_saturday 40.11882
## 7 LDA_04 36.39690
## 8 data_channel_is_socmed 35.51731
## 9 data_channel_is_entertainment 35.28413
## 10 LDA_01 35.21362
## 11 LDA_03 33.36723
## 12 n_non_stop_unique_tokens 30.58916
## 13 num_hrefs 30.46511
## 14 kw_max_max 30.41531
## 15 num_imgs 30.37953
## 16 kw_avg_max 29.08977
## 17 weekday_is_sunday 27.90197
## 18 average_token_length 27.06701
## 19 global_rate_positive_words 26.30637
## 20 global_subjectivity 24.64500
# predictors MeanDecreaseAccuracy
#1 kw_max_avg 61.93944
#2 self_reference_min_shares 55.07650
#3 LDA_00 44.46731
#4 kw_min_avg 43.71751
#5 LDA_02 42.55685
#6 weekday_is_saturday 40.11882
#7 LDA_04 36.39690
#8 data_channel_is_socmed 35.51731
#9 data_channel_is_entertainment 35.28413
#10 LDA_01 35.21362
#11 LDA_03 33.36723
#12 n_non_stop_unique_tokens 30.58916
#13 num_hrefs 30.46511
#14 kw_max_max 30.41531
#15 num_imgs 30.37953
#16 kw_avg_max 29.08977
#17 weekday_is_sunday 27.90197
#18 average_token_length 27.06701
#19 global_rate_positive_words 26.30637
#20 global_subjectivity 24.64500
#Print the top predictors in terms of Mean Decrease in Gini
#(top 20 rows of the Gini-sorted importance table)
print(imp_gini.top)
## predictors MeanDecreaseGini
## 1 kw_max_avg 1153.9229
## 2 self_reference_min_shares 946.6298
## 3 LDA_02 907.7972
## 4 kw_avg_max 814.8555
## 5 n_non_stop_unique_tokens 806.6361
## 6 LDA_01 803.7221
## 7 LDA_04 797.0068
## 8 LDA_00 794.8339
## 9 global_subjectivity 774.4632
## 10 average_token_length 756.7018
## 11 global_rate_positive_words 754.0087
## 12 kw_max_min 753.7223
## 13 LDA_03 751.7727
## 14 kw_min_avg 728.7742
## 15 avg_positive_polarity 723.5347
## 16 global_rate_negative_words 716.7345
## 17 avg_negative_polarity 694.9866
## 18 num_hrefs 609.6210
## 19 n_tokens_title 445.9442
## 20 title_sentiment_polarity 442.6008
# predictors MeanDecreaseGini
#1 kw_max_avg 1153.9229
#2 self_reference_min_shares 946.6298
#3 LDA_02 907.7972
#4 kw_avg_max 814.8555
#5 n_non_stop_unique_tokens 806.6361
#6 LDA_01 803.7221
#7 LDA_04 797.0068
#8 LDA_00 794.8339
#9 global_subjectivity 774.4632
#10 average_token_length 756.7018
#11 global_rate_positive_words 754.0087
#12 kw_max_min 753.7223
#13 LDA_03 751.7727
#14 kw_min_avg 728.7742
#15 avg_positive_polarity 723.5347
#16 global_rate_negative_words 716.7345
#17 avg_negative_polarity 694.9866
#18 num_hrefs 609.6210
#19 n_tokens_title 445.9442
#20 title_sentiment_polarity 442.6008
#Subset the data with the required independent and dependent variables.
#The union of the top variables by mean decrease in accuracy and mean
#decrease in gini is kept for the model, plus the target shares_cat.
selected_vars <- c(
  "kw_max_avg", "self_reference_min_shares", "LDA_00", "kw_min_avg", "LDA_02",
  "weekday_is_saturday", "LDA_04", "data_channel_is_socmed",
  "data_channel_is_entertainment", "LDA_01", "LDA_03", "n_non_stop_unique_tokens",
  "num_hrefs", "kw_max_max", "num_imgs", "kw_avg_max", "weekday_is_sunday",
  "average_token_length", "global_rate_positive_words", "global_subjectivity",
  "kw_max_min", "avg_positive_polarity", "global_rate_negative_words",
  "avg_negative_polarity", "n_tokens_title", "title_sentiment_polarity",
  "shares_cat"
)
pop_norm_subset <- pop_norm[, selected_vars]
#Structure of the dataset to be used for predictive analysis
str(pop_norm_subset)
## 'data.frame': 39644 obs. of 27 variables:
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 0.000588 0 0.001089 0 0.000646 ...
## $ LDA_00 : num 0.5397 0.8627 0.2349 0.0308 0.0309 ...
## $ kw_min_avg : num 0.000277 0.000277 0.000277 0.000277 0.000277 ...
## $ LDA_02 : num 0.0435 0.0545 0.0363 0.5377 0.0311 ...
## $ weekday_is_saturday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ LDA_04 : num 0.0433 0.0539 0.7358 0.0308 0.955 ...
## $ data_channel_is_socmed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ data_channel_is_entertainment: Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 1 1 1 ...
## $ LDA_01 : num 0.4085 0.054 0.036 0.4528 0.0311 ...
## $ LDA_03 : num 0.0445 0.0541 0.036 0.0312 0.0308 ...
## $ n_non_stop_unique_tokens : num 0.001254 0.001218 0.001021 0.001024 0.000832 ...
## $ num_hrefs : num 0.01316 0.00987 0.00987 0.02961 0.0625 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num_imgs : num 0.00781 0.00781 0.00781 0.00781 0.15625 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ average_token_length : num 0.582 0.611 0.546 0.548 0.582 ...
## $ global_rate_positive_words : num 0.294 0.277 0.366 0.266 0.48 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ global_rate_negative_words : num 0.0741 0.0848 0.0513 0.112 0.0656 ...
## $ avg_negative_polarity : num 0.65 0.881 0.533 0.63 0.78 ...
## $ n_tokens_title : num 0.476 0.333 0.333 0.333 0.524 ...
## $ title_sentiment_polarity : num 0.406 0.5 0.5 0.5 0.568 ...
## $ shares_cat : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 2 1 ...
#'data.frame': 39644 obs. of 27 variables:
#$ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
#$ self_reference_min_shares : num 0.000588 0 0.001089 0 0.000646 ...
#$ LDA_00 : num 0.5397 0.8627 0.2349 0.0308 0.0309 ...
#$ kw_min_avg : num 0.000277 0.000277 0.000277 0.000277 0.000277 ...
#$ LDA_02 : num 0.0435 0.0545 0.0363 0.5377 0.0311 ...
#$ weekday_is_saturday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#$ LDA_04 : num 0.0433 0.0539 0.7358 0.0308 0.955 ...
#$ data_channel_is_socmed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#$ data_channel_is_entertainment: Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 1 1 1 ...
#$ LDA_01 : num 0.4085 0.054 0.036 0.4528 0.0311 ...
#$ LDA_03 : num 0.0445 0.0541 0.036 0.0312 0.0308 ...
#$ n_non_stop_unique_tokens : num 0.001254 0.001218 0.001021 0.001024 0.000832 ...
#$ num_hrefs : num 0.01316 0.00987 0.00987 0.02961 0.0625 ...
#$ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
#$ num_imgs : num 0.00781 0.00781 0.00781 0.00781 0.15625 ...
#$ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
#$ weekday_is_sunday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#$ average_token_length : num 0.582 0.611 0.546 0.548 0.582 ...
#$ global_rate_positive_words : num 0.294 0.277 0.366 0.266 0.48 ...
#$ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
#$ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
#$ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
#$ global_rate_negative_words : num 0.0741 0.0848 0.0513 0.112 0.0656 ...
#$ avg_negative_polarity : num 0.65 0.881 0.533 0.63 0.78 ...
#$ n_tokens_title : num 0.476 0.333 0.333 0.333 0.524 ...
#$ title_sentiment_polarity : num 0.406 0.5 0.5 0.5 0.568 ...
#$ shares_cat : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 2 1 ...
#Five-number summaries for the numeric variables and level/frequency counts for the factor variables
summary(pop_norm_subset)
## kw_max_avg self_reference_min_shares LDA_00
## Min. :0.00000 Min. :0.0000000 Min. :0.00000
## 1st Qu.:0.01194 1st Qu.:0.0007577 1st Qu.:0.02702
## Median :0.01460 Median :0.0014230 Median :0.03602
## Mean :0.01896 Mean :0.0047418 Mean :0.19914
## 3rd Qu.:0.02017 3rd Qu.:0.0030831 3rd Qu.:0.25993
## Max. :1.00000 Max. :1.0000000 Max. :1.00000
## kw_min_avg LDA_02 weekday_is_saturday LDA_04
## Min. :0.0000000 Min. :0.00000 0:37191 Min. :0.00000
## 1st Qu.:0.0002767 1st Qu.:0.03106 1: 2453 1st Qu.:0.03082
## Median :0.2835153 Median :0.04348 Median :0.04393
## Mean :0.3093897 Mean :0.23513 Mean :0.25241
## 3rd Qu.:0.5693853 3rd Qu.:0.36328 3rd Qu.:0.43140
## Max. :1.0000000 Max. :1.00000 Max. :1.00000
## data_channel_is_socmed data_channel_is_entertainment LDA_01
## 0:37321 0:32587 Min. :0.00000
## 1: 2323 1: 7057 1st Qu.:0.02701
## Median :0.03601
## Mean :0.15255
## 3rd Qu.:0.16289
## Max. :1.00000
## LDA_03 n_non_stop_unique_tokens num_hrefs kw_max_max
## Min. :0.00000 Min. :0.0000000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.03084 1st Qu.:0.0009627 1st Qu.:0.01316 1st Qu.:1.0000
## Median :0.04317 Median :0.0010623 Median :0.02632 Median :1.0000
## Mean :0.24151 Mean :0.0010603 Mean :0.03580 Mean :0.8921
## 3rd Qu.:0.40556 3rd Qu.:0.0011610 3rd Qu.:0.04605 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000000 Max. :1.00000 Max. :1.0000
## num_imgs kw_avg_max weekday_is_sunday average_token_length
## Min. :0.000000 Min. :0.0000 0:36907 Min. :0.0000
## 1st Qu.:0.007812 1st Qu.:0.2050 1: 2737 1st Qu.:0.5569
## Median :0.007812 Median :0.2900 Median :0.5800
## Mean :0.035501 Mean :0.3075 Mean :0.5656
## 3rd Qu.:0.031250 3rd Qu.:0.3925 3rd Qu.:0.6037
## Max. :1.000000 Max. :1.0000 Max. :1.0000
## global_rate_positive_words global_subjectivity kw_max_min
## Min. :0.0000 Min. :0.0000 Min. :0.000000
## 1st Qu.:0.1825 1st Qu.:0.3962 1st Qu.:0.001491
## Median :0.2510 Median :0.4535 Median :0.002212
## Mean :0.2548 Mean :0.4434 Mean :0.003867
## 3rd Qu.:0.3234 3rd Qu.:0.5083 3rd Qu.:0.003351
## Max. :1.0000 Max. :1.0000 Max. :1.000000
## avg_positive_polarity global_rate_negative_words avg_negative_polarity
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3062 1st Qu.:0.05199 1st Qu.:0.6716
## Median :0.3588 Median :0.08294 Median :0.7467
## Mean :0.3538 Mean :0.08983 Mean :0.7405
## 3rd Qu.:0.4114 3rd Qu.:0.11755 3rd Qu.:0.8131
## Max. :1.0000 Max. :1.00000 Max. :1.0000
## n_tokens_title title_sentiment_polarity shares_cat
## Min. :0.0000 Min. :0.0000 0:20082
## 1st Qu.:0.3333 1st Qu.:0.5000 1:19562
## Median :0.3810 Median :0.5000
## Mean :0.3999 Mean :0.5357
## 3rd Qu.:0.4762 3rd Qu.:0.5750
## Max. :1.0000 Max. :1.0000
#kw_max_avg self_reference_min_shares LDA_00 kw_min_avg LDA_02 weekday_is_saturday
#Min. :0.00000 Min. :0.0000000 Min. :0.00000 Min. :0.0000000 Min. :0.00000 0:37191
#1st Qu.:0.01194 1st Qu.:0.0007577 1st Qu.:0.02702 1st Qu.:0.0002767 1st Qu.:0.03106 1: 2453
#Median :0.01460 Median :0.0014230 Median :0.03602 Median :0.2835153 Median :0.04348
#Mean :0.01896 Mean :0.0047418 Mean :0.19914 Mean :0.3093897 Mean :0.23513
#3rd Qu.:0.02017 3rd Qu.:0.0030831 3rd Qu.:0.25993 3rd Qu.:0.5693853 3rd Qu.:0.36328
#Max. :1.00000 Max. :1.0000000 Max. :1.00000 Max. :1.0000000 Max. :1.00000
#LDA_04 data_channel_is_socmed data_channel_is_entertainment LDA_01 LDA_03
#Min. :0.00000 0:37321 0:32587 Min. :0.00000 Min. :0.00000
#1st Qu.:0.03082 1: 2323 1: 7057 1st Qu.:0.02701 1st Qu.:0.03084
#Median :0.04393 Median :0.03601 Median :0.04317
#Mean :0.25241 Mean :0.15255 Mean :0.24151
#3rd Qu.:0.43140 3rd Qu.:0.16289 3rd Qu.:0.40556
#Max. :1.00000 Max. :1.00000 Max. :1.00000
#n_non_stop_unique_tokens num_hrefs kw_max_max num_imgs kw_avg_max weekday_is_sunday
#Min. :0.0000000 Min. :0.00000 Min. :0.0000 Min. :0.000000 Min. :0.0000 0:36907
#1st Qu.:0.0009627 1st Qu.:0.01316 1st Qu.:1.0000 1st Qu.:0.007812 1st Qu.:0.2050 1: 2737
#Median :0.0010623 Median :0.02632 Median :1.0000 Median :0.007812 Median :0.2900
#Mean :0.0010603 Mean :0.03580 Mean :0.8921 Mean :0.035501 Mean :0.3075
#3rd Qu.:0.0011610 3rd Qu.:0.04605 3rd Qu.:1.0000 3rd Qu.:0.031250 3rd Qu.:0.3925
#Max. :1.0000000 Max. :1.00000 Max. :1.0000 Max. :1.000000 Max. :1.0000
#average_token_length global_rate_positive_words global_subjectivity kw_max_min avg_positive_polarity
#Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000000 Min. :0.0000
#1st Qu.:0.5569 1st Qu.:0.1825 1st Qu.:0.3962 1st Qu.:0.001491 1st Qu.:0.3062
#Median :0.5800 Median :0.2510 Median :0.4535 Median :0.002212 Median :0.3588
#Mean :0.5656 Mean :0.2548 Mean :0.4434 Mean :0.003867 Mean :0.3538
#3rd Qu.:0.6037 3rd Qu.:0.3234 3rd Qu.:0.5083 3rd Qu.:0.003351 3rd Qu.:0.4114
#Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.000000 Max. :1.0000
#global_rate_negative_words avg_negative_polarity n_tokens_title title_sentiment_polarity shares_cat
#Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000 0:20082
#1st Qu.:0.05199 1st Qu.:0.6716 1st Qu.:0.3333 1st Qu.:0.5000 1:19562
#Median :0.08294 Median :0.7467 Median :0.3810 Median :0.5000
#Mean :0.08983 Mean :0.7405 Mean :0.3999 Mean :0.5357
#3rd Qu.:0.11755 3rd Qu.:0.8131 3rd Qu.:0.4762 3rd Qu.:0.5750
#Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
#Step 4 - Experimental Design
#In this step, the processed data may be split into 70% training, 15% validation, and 15% test sets in order to tune the #parameters of the models that will be studied; as well as split into training and test sets using 10-fold #cross-validation to improve the performance of the models, and these results may be used to compare each model.
#First, create train, validation, and test sets for parameter tuning
#Draw row indices for a 70% training split (same seed/draws as the original script)
set.seed(123)
n_obs <- nrow(pop_norm_subset)
idx_train <- sample(seq_len(n_obs), 0.7 * n_obs)
#Training set (70% of observations)
train.set <- pop_norm_subset[idx_train, ]
#The remaining 30% will be divided into the validation and test sets
test_valid.set <- pop_norm_subset[-idx_train, ]
#Draw indices for half of the held-out data (15% validation / 15% test)
set.seed(123)
idx_half <- sample(seq_len(nrow(test_valid.set)), 0.5 * nrow(test_valid.set))
#Validation set (15%)
valid.set <- test_valid.set[idx_half, ]
#Test set (15%)
test.set <- test_valid.set[-idx_half, ]
#Confirm that the shares categories are balanced in each split
summary(train.set$shares_cat)
## 0 1
## 14051 13699
summary(valid.set$shares_cat)
## 0 1
## 3042 2905
summary(test.set$shares_cat)
## 0 1
## 2989 2958
#The popular and unpopular levels of shares are fairly balanced.
#Second, create train and test sets using 10-fold cross-validation
set.seed(123)
#createFolds (caret) returns a list of 10 roughly equal, stratified index vectors
folds <- createFolds(pop_norm_subset$shares_cat)
str(folds)
## List of 10
## $ Fold01: int [1:3964] 4 6 11 12 17 21 29 46 49 52 ...
## $ Fold02: int [1:3964] 2 14 16 30 37 38 56 68 72 79 ...
## $ Fold03: int [1:3966] 41 43 48 51 63 80 92 97 103 104 ...
## $ Fold04: int [1:3964] 34 36 40 50 74 76 83 84 90 118 ...
## $ Fold05: int [1:3964] 22 32 57 69 91 96 108 123 148 154 ...
## $ Fold06: int [1:3964] 1 7 13 24 27 31 35 54 58 62 ...
## $ Fold07: int [1:3964] 10 18 19 23 28 47 55 93 99 107 ...
## $ Fold08: int [1:3965] 3 5 8 9 25 59 65 77 100 116 ...
## $ Fold09: int [1:3964] 15 20 33 39 44 53 67 86 89 112 ...
## $ Fold10: int [1:3965] 26 42 45 78 94 98 102 113 124 131 ...
#NOTE(review): this loop overwrites 'train' and 'test' on every iteration and fits
#nothing inside the loop body, so after it completes only the LAST fold (Fold10)
#defines the split. The result is a single ~90%/10% partition, NOT true 10-fold
#cross-validation. To cross-validate, the model fit and evaluation should happen
#inside this loop (or via caret's trainControl, as done for kNN/ctree below).
for (f in folds){
train <- pop_norm_subset[-f,]
test <- pop_norm_subset[f,]
}
#Check that the shares categories are balanced
summary(train$shares_cat)
## 0 1
## 18073 17606
#0 1
#18073 17606
summary(test$shares_cat)
## 0 1
## 2009 1956
#0 1
#2009 1956
#The popular and unpopular levels of shares are fairly balanced.
#Step 5 - Prediction & Step 6 - Performance Evaluation
#Prediction: classification models - Logistic Regression, K-Nearest Neighbours, Decision Trees, Random Forest, and #Gradient Boosting Machine - are tested to find the algorithm with the best accuracy.
#Performance Evaluation: at this stage, the machine learning algorithms that have been run are evaluated using various #methods. Several metrics are computed including Accuracy, Precision, Recall/Sensitivity, Specificity, and F1 score, #depending on the Confusion Matrix.
#Note the results found are similar to what has been found in the literature.
#Logistic Regression
#For logistic regression, there are no tuning parameters
#NOTE(review): this is a single glm() fit on the 'train' object produced by the
#folds loop above (i.e. the last fold's ~90% partition) - no cross-validation is
#actually performed here, despite the original wording. set.seed() has no effect
#on a deterministic glm fit; it is kept for consistency with the other models.
set.seed(123)
glm_model <- glm(shares_cat ~ ., family = binomial(link='logit'), data = train)
#Print the model summary: coefficients (log-odds scale), deviances, and AIC
summary(glm_model)
##
## Call:
## glm(formula = shares_cat ~ ., family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7418 -1.0827 -0.6992 1.1054 1.9157
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 601.44697 109.72008 5.482 4.21e-08 ***
## kw_max_avg 7.80835 1.00111 7.800 6.20e-15 ***
## self_reference_min_shares 6.37848 1.00096 6.372 1.86e-10 ***
## LDA_00 -557.22283 101.70299 -5.479 4.28e-08 ***
## kw_min_avg 0.41822 0.03977 10.515 < 2e-16 ***
## LDA_02 -554.06224 100.93737 -5.489 4.04e-08 ***
## weekday_is_saturday1 0.93981 0.05003 18.785 < 2e-16 ***
## LDA_04 -557.20049 101.72319 -5.478 4.31e-08 ***
## data_channel_is_socmed1 0.81527 0.05225 15.603 < 2e-16 ***
## data_channel_is_entertainment1 -0.56514 0.04034 -14.010 < 2e-16 ***
## LDA_01 -556.97269 101.58393 -5.483 4.18e-08 ***
## LDA_03 -557.03894 101.64952 -5.480 4.25e-08 ***
## n_non_stop_unique_tokens -591.38563 82.41525 -7.176 7.19e-13 ***
## num_hrefs 2.79933 0.37072 7.551 4.32e-14 ***
## kw_max_max -0.25513 0.05708 -4.469 7.85e-06 ***
## num_imgs 0.44273 0.20944 2.114 0.03453 *
## kw_avg_max -0.23634 0.10530 -2.245 0.02480 *
## weekday_is_sunday1 0.73161 0.04520 16.187 < 2e-16 ***
## average_token_length -0.63080 0.19441 -3.245 0.00118 **
## global_rate_positive_words 0.10389 0.11911 0.872 0.38311
## global_subjectivity 1.47690 0.15444 9.563 < 2e-16 ***
## kw_max_min -2.52485 1.44210 -1.751 0.07998 .
## avg_positive_polarity -0.27197 0.14544 -1.870 0.06149 .
## global_rate_negative_words -0.10257 0.20993 -0.489 0.62514
## avg_negative_polarity -0.03974 0.10384 -0.383 0.70194
## n_tokens_title -0.11805 0.11385 -1.037 0.29980
## title_sentiment_polarity 0.43254 0.08683 4.981 6.32e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 49455 on 35678 degrees of freedom
## Residual deviance: 45911 on 35652 degrees of freedom
## AIC: 45965
##
## Number of Fisher Scoring iterations: 8
#Deviance Residuals:
# Min 1Q Median 3Q Max
# -3.7418 -1.0827 -0.6992 1.1054 1.9157
#Coefficients:
# Estimate Std. Error z value Pr(>|z|)
#(Intercept) 601.44697 109.72008 5.482 4.21e-08 ***
# kw_max_avg 7.80835 1.00111 7.800 6.20e-15 ***
# self_reference_min_shares 6.37848 1.00096 6.372 1.86e-10 ***
# LDA_00 -557.22283 101.70299 -5.479 4.28e-08 ***
# kw_min_avg 0.41822 0.03977 10.515 < 2e-16 ***
# LDA_02 -554.06224 100.93737 -5.489 4.04e-08 ***
# weekday_is_saturday1 0.93981 0.05003 18.785 < 2e-16 ***
# LDA_04 -557.20049 101.72319 -5.478 4.31e-08 ***
# data_channel_is_socmed1 0.81527 0.05225 15.603 < 2e-16 ***
# data_channel_is_entertainment1 -0.56514 0.04034 -14.010 < 2e-16 ***
# LDA_01 -556.97269 101.58393 -5.483 4.18e-08 ***
# LDA_03 -557.03894 101.64952 -5.480 4.25e-08 ***
# n_non_stop_unique_tokens -591.38563 82.41525 -7.176 7.19e-13 ***
# num_hrefs 2.79933 0.37072 7.551 4.32e-14 ***
# kw_max_max -0.25513 0.05708 -4.469 7.85e-06 ***
# num_imgs 0.44273 0.20944 2.114 0.03453 *
# kw_avg_max -0.23634 0.10530 -2.245 0.02480 *
# weekday_is_sunday1 0.73161 0.04520 16.187 < 2e-16 ***
# average_token_length -0.63080 0.19441 -3.245 0.00118 **
# global_rate_positive_words 0.10389 0.11911 0.872 0.38311
#global_subjectivity 1.47690 0.15444 9.563 < 2e-16 ***
# kw_max_min -2.52485 1.44210 -1.751 0.07998 .
#avg_positive_polarity -0.27197 0.14544 -1.870 0.06149 .
#global_rate_negative_words -0.10257 0.20993 -0.489 0.62514
#avg_negative_polarity -0.03974 0.10384 -0.383 0.70194
#n_tokens_title -0.11805 0.11385 -1.037 0.29980
#title_sentiment_polarity 0.43254 0.08683 4.981 6.32e-07 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#(Dispersion parameter for binomial family taken to be 1)
#Null deviance: 49455 on 35678 degrees of freedom
#Residual deviance: 45911 on 35652 degrees of freedom
#AIC: 45965
#Number of Fisher Scoring iterations: 8
#The coefficients of the predictors indicate that every one unit change in the independent variable produces a
#specific unit change in the log odds of the dependent variable i.e. the log odds of an article being popular.
#The largest impact is from the rate of unique non-stop words in the content, followed by closeness to LDA topics 0,
#1, 2, 3, and 4 which all have a negative relationship with the log odds of an article being popular.
#Most of the estimates are statistically significant and the model produces the following relationship between the
#log odds and the predictors (only including the independent variables that have a significant effect on the dependent
#variable):
#logit(p) = 601.45 + 7.81*(maximum shares of average keyword) + 6.38*(minimum shares of referenced articles in Mashable)
# - 557.22*(closeness to LDA topic 0) + 0.42*(minimum shares of the average keyword)
# - 554.06*(closeness to LDA topic 2) + 0.94*(article published on Saturday)
# - 557.2*(closeness to LDA topic 4) + 0.82*(data channel is social media)
# - 0.56*(data channel is entertainment) - 556.97*(closeness to LDA topic 1)
# - 557.04*(closeness to LDA topic 3) - 591.39*(rate of unique non-stop words in the content)
# + 2.8*(number of links) - 0.26*(maximum shares of best keyword) + 0.44*(number of images)
# - 0.24*(average shares of the best keyword) + 0.73*(article published on Sunday)
# - 0.63*(average length of the words in the content) + 1.48*(text subjectivity) + 0.43*(title polarity)
#Furthermore, the residual deviance is found to be better than the null deviance.
#Diagnostic plots for the fitted model (residuals vs fitted, Q-Q, scale-location, leverage)
plot(glm_model)
## Warning: not plotting observations with leverage one:
## 27930
## Warning: not plotting observations with leverage one:
## 27930
#Analysis-of-deviance table: each term is added sequentially (first to last)
#and its drop in residual deviance is tested with a chi-squared test
anova(glm_model, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: shares_cat
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 35678 49455
## kw_max_avg 1 265.89 35677 49190 < 2.2e-16 ***
## self_reference_min_shares 1 108.83 35676 49081 < 2.2e-16 ***
## LDA_00 1 166.84 35675 48914 < 2.2e-16 ***
## kw_min_avg 1 230.89 35674 48683 < 2.2e-16 ***
## LDA_02 1 549.65 35673 48133 < 2.2e-16 ***
## weekday_is_saturday 1 397.77 35672 47736 < 2.2e-16 ***
## LDA_04 1 247.32 35671 47488 < 2.2e-16 ***
## data_channel_is_socmed 1 384.11 35670 47104 < 2.2e-16 ***
## data_channel_is_entertainment 1 385.15 35669 46719 < 2.2e-16 ***
## LDA_01 1 40.67 35668 46678 1.801e-10 ***
## LDA_03 1 1.60 35667 46677 0.2058005
## n_non_stop_unique_tokens 1 108.87 35666 46568 < 2.2e-16 ***
## num_hrefs 1 147.18 35665 46421 < 2.2e-16 ***
## kw_max_max 1 56.83 35664 46364 4.761e-14 ***
## num_imgs 1 11.72 35663 46352 0.0006167 ***
## kw_avg_max 1 12.63 35662 46340 0.0003796 ***
## weekday_is_sunday 1 272.29 35661 46067 < 2.2e-16 ***
## average_token_length 1 0.00 35660 46067 0.9597446
## global_rate_positive_words 1 23.08 35659 46044 1.551e-06 ***
## global_subjectivity 1 101.37 35658 45943 < 2.2e-16 ***
## kw_max_min 1 2.95 35657 45940 0.0856799 .
## avg_positive_polarity 1 2.22 35656 45938 0.1360054
## global_rate_negative_words 1 1.32 35655 45936 0.2507089
## avg_negative_polarity 1 0.01 35654 45936 0.9422567
## n_tokens_title 1 0.89 35653 45935 0.3452956
## title_sentiment_polarity 1 24.88 35652 45911 6.099e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Analysis of Deviance Table
#Model: binomial, link: logit
#Response: shares_cat
#Terms added sequentially (first to last)
# Df Deviance Resid. Df Resid. Dev Pr(>Chi)
#NULL 35678 49455
#kw_max_avg 1 265.89 35677 49190 < 2.2e-16 ***
# self_reference_min_shares 1 108.83 35676 49081 < 2.2e-16 ***
# LDA_00 1 166.84 35675 48914 < 2.2e-16 ***
# kw_min_avg 1 230.89 35674 48683 < 2.2e-16 ***
# LDA_02 1 549.65 35673 48133 < 2.2e-16 ***
# weekday_is_saturday 1 397.77 35672 47736 < 2.2e-16 ***
# LDA_04 1 247.32 35671 47488 < 2.2e-16 ***
# data_channel_is_socmed 1 384.11 35670 47104 < 2.2e-16 ***
# data_channel_is_entertainment 1 385.15 35669 46719 < 2.2e-16 ***
# LDA_01 1 40.67 35668 46678 1.801e-10 ***
# LDA_03 1 1.60 35667 46677 0.2058005
#n_non_stop_unique_tokens 1 108.87 35666 46568 < 2.2e-16 ***
# num_hrefs 1 147.18 35665 46421 < 2.2e-16 ***
# kw_max_max 1 56.83 35664 46364 4.761e-14 ***
# num_imgs 1 11.72 35663 46352 0.0006167 ***
# kw_avg_max 1 12.63 35662 46340 0.0003796 ***
# weekday_is_sunday 1 272.29 35661 46067 < 2.2e-16 ***
# average_token_length 1 0.00 35660 46067 0.9597446
#global_rate_positive_words 1 23.08 35659 46044 1.551e-06 ***
# global_subjectivity 1 101.37 35658 45943 < 2.2e-16 ***
# kw_max_min 1 2.95 35657 45940 0.0856799 .
#avg_positive_polarity 1 2.22 35656 45938 0.1360054
#global_rate_negative_words 1 1.32 35655 45936 0.2507089
#avg_negative_polarity 1 0.01 35654 45936 0.9422567
#n_tokens_title 1 0.89 35653 45935 0.3452956
#title_sentiment_polarity 1 24.88 35652 45911 6.099e-07 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#For most of the variables in the model, there is a drop in deviance when adding each variable at a time.
#Among the variables that significantly reduce the residual deviance are closeness to LDA topic 2, followed by
#whether the article was published on Saturday, whether the data channel is entertainment, whether the data channel
#is social media, whether the article was published on Sunday, maximum shares of the average keyword, closeness to
#LDA topic 4, minimum shares of the average keyword, closeness to LDA topic 0, minimum shares of referenced articles in
#Mashable, and so on.
#The McFadden R2 index may be used to assess the model fit
#install.packages("pscl")
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
#Compute pseudo-R2 measures (McFadden, ML/Cox-Snell, Cragg-Uhler) for the fitted model;
#pR2 refits the null (intercept-only) model internally for the comparison
glm_r2 <- pR2(glm_model)
## fitting null model for pseudo-r2
print(glm_r2)
## llh llhNull G2 McFadden r2ML
## -2.295526e+04 -2.472774e+04 3.544969e+03 7.168000e-02 9.458085e-02
## r2CU
## 1.261150e-01
#Alternative format: the same values in fixed (non-scientific) notation
print(format(glm_r2, scientific = FALSE))
## llh llhNull G2 McFadden
## "-22955.25744019" "-24727.74190234" " 3544.96892429" " 0.07168000"
## r2ML r2CU
## " 0.09458085" " 0.12611500"
#fitting null model for pseudo-r2
# llh llhNull G2 McFadden r2ML r2CU
# -2.295526e+04 -2.472774e+04 3.544969e+03 7.168000e-02 9.458085e-02 1.261150e-01
#McFadden R2 = 0.07168 may not be a good model fit.
#Make a prediction on the test set (predicted probability of the positive class "1")
glm_prediction <- predict(glm_model, test, type = "response")
#Threshold the probabilities at 0.5. The factor levels are fixed explicitly so the
#confusion matrix stays well-formed even if every prediction falls on one side of
#the threshold (as.factor would silently drop the missing level in that case).
glm_pred_results <- factor(ifelse(glm_prediction > 0.5, 1, 0), levels = c(0, 1))
#Confusion matrix and derived statistics, treating "1" (popular) as the positive class
glm_ConfusionMatrix_stats <- confusionMatrix(glm_pred_results, test$shares_cat, mode = "everything", positive = "1")
print(glm_ConfusionMatrix_stats)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1334 742
## 1 675 1214
##
## Accuracy : 0.6426
## 95% CI : (0.6275, 0.6576)
## No Information Rate : 0.5067
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.2848
##
## Mcnemar's Test P-Value : 0.07955
##
## Sensitivity : 0.6207
## Specificity : 0.6640
## Pos Pred Value : 0.6427
## Neg Pred Value : 0.6426
## Precision : 0.6427
## Recall : 0.6207
## F1 : 0.6315
## Prevalence : 0.4933
## Detection Rate : 0.3062
## Detection Prevalence : 0.4764
## Balanced Accuracy : 0.6423
##
## 'Positive' Class : 1
##
#Confusion Matrix and Statistics
# Reference
#Prediction 0 1
# 0 1334 742
# 1 675 1214
#Accuracy : 0.6426
#95% CI : (0.6275, 0.6576)
#No Information Rate : 0.5067
#P-Value [Acc > NIR] : < 2e-16
#Kappa : 0.2848
#Mcnemar's Test P-Value : 0.07955
# Sensitivity : 0.6207
# Specificity : 0.6640
# Pos Pred Value : 0.6427
# Neg Pred Value : 0.6426
# Precision : 0.6427
# Recall : 0.6207
# F1 : 0.6315
# Prevalence : 0.4933
# Detection Rate : 0.3062
# Detection Prevalence : 0.4764
# Balanced Accuracy : 0.6423
#Note for statistics found based on the confusion matrix:
#For a 2x2 confusion matrix where
# Reference
#Predicted Event No Event
#Event A B
#No Event C D
#The formulas used are:
#Sensitivity = A/(A+C)
#Specificity = D/(B+D)
#Prevalence = (A+C)/(A+B+C+D)
#PPV = (sensitivity * prevalence)/((sensitivity*prevalence) + ((1-specificity)*(1-prevalence)))
#NPV = (specificity * (1-prevalence))/(((1-sensitivity)*prevalence) + ((specificity)*(1-prevalence)))
#Detection Rate = A/(A+B+C+D)
#Detection Prevalence = (A+B)/(A+B+C+D)
#Balanced Accuracy = (sensitivity+specificity)/2
#Precision = A/(A+B)
#Recall = A/(A+C)
#F1 = (1+beta^2)*precision*recall/((beta^2 * precision)+recall)
#where beta = 1 for this function.
#AUC Score
#Note: a model with good predictive ability should have an Area Under the Curve (AUC) closer to 1 than to 0.5.
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#Build the ROC object from the true labels and the predicted probabilities
glmauc <- roc(test$shares_cat, glm_prediction)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(glmauc)
##
## Call:
## roc.default(response = test$shares_cat, predictor = glm_prediction)
##
## Data: glm_prediction in 2009 controls (test$shares_cat 0) < 1956 cases (test$shares_cat 1).
## Area under the curve: 0.6883
#Area under the curve is 0.6883
#Plot AUC
plot(glmauc)
#Plot ROC curve using ROCR (true-positive rate vs false-positive rate)
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.6.2
glmroc_pred <- prediction(glm_prediction, test$shares_cat)
glm_performance <- performance(glmroc_pred, "tpr", "fpr")
plot(glm_performance, colorize=TRUE)
#K-Nearest Neighbours
library("class")
#install.packages("gmodels")
library("gmodels")
#(gmodels masks 'ci' from pROC when attached)
#Drop the target column from the feature sets by name rather than by position
target_col <- "shares_cat"
train.set_new <- train.set[, names(train.set) != target_col]
test.set_new <- test.set[, names(test.set) != target_col]
valid.set_new <- valid.set[, names(valid.set) != target_col]
train_new <- train[, names(train) != target_col]
test_new <- test[, names(test) != target_col]
#Keep the class labels separately for each split
train.set_labels <- train.set[[target_col]]
test.set_labels <- test.set[[target_col]]
valid.set_labels <- valid.set[[target_col]]
train_labels <- train[[target_col]]
test_labels <- test[[target_col]]
#Tune k (the number of neighbours) over 10 candidate values using 10-fold CV
set.seed(123)
knnfit <- train(
  x = train.set_new, y = train.set_labels, method = "knn", tuneLength = 10,
  trControl = trainControl(method = "cv", search = "grid")
)
#Print the training model to see the cross-validated accuracy per candidate k
print(knnfit)
## k-Nearest Neighbors
##
## 27750 samples
## 26 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 24974, 24975, 24975, 24975, 24975, 24975, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.6020179 0.2032993
## 7 0.6069909 0.2132239
## 9 0.6144868 0.2282005
## 11 0.6195678 0.2383678
## 13 0.6232789 0.2457662
## 15 0.6232428 0.2456625
## 17 0.6267745 0.2527195
## 19 0.6275317 0.2542411
## 21 0.6261626 0.2514936
## 23 0.6281806 0.2555366
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
#Accuracy was used to select the optimal model using the largest value.
#The final value used for the model was k = 23.
#Plot accuracy against k
plot(knnfit)
#The most accurate predictions are made at k = 23
#Validate and test the model with k = 23
#NOTE(review): this knn() call uses valid.set as the neighbour pool ('train'
#argument) and classifies test.set against it. If the intent was to train on
#train.set, this is likely a bug; as written, the reported accuracy reflects
#neighbours drawn from the 15% validation split only - confirm intent.
knn_prediction <- knn(train = valid.set_new, test = test.set_new, cl= valid.set_labels, k = 23)
#Confusion Matrix and Statistics, with "1" (popular) as the positive class
knn_ConfusionMatrix_stats = confusionMatrix(knn_prediction, test.set_labels, mode = "everything", positive = "1")
print(knn_ConfusionMatrix_stats)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1937 1207
## 1 1052 1751
##
## Accuracy : 0.6201
## 95% CI : (0.6077, 0.6325)
## No Information Rate : 0.5026
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2401
##
## Mcnemar's Test P-Value : 0.001195
##
## Sensitivity : 0.5920
## Specificity : 0.6480
## Pos Pred Value : 0.6247
## Neg Pred Value : 0.6161
## Precision : 0.6247
## Recall : 0.5920
## F1 : 0.6079
## Prevalence : 0.4974
## Detection Rate : 0.2944
## Detection Prevalence : 0.4713
## Balanced Accuracy : 0.6200
##
## 'Positive' Class : 1
##
#Confusion Matrix and Statistics
# Reference
#Prediction 0 1
# 0 1937 1207
# 1 1052 1751
#Accuracy : 0.6201
#95% CI : (0.6077, 0.6325)
#No Information Rate : 0.5026
#P-Value [Acc > NIR] : < 2.2e-16
#Kappa : 0.2401
#Mcnemar's Test P-Value : 0.001195
# Sensitivity : 0.5920
# Specificity : 0.6480
# Pos Pred Value : 0.6247
# Neg Pred Value : 0.6161
# Precision : 0.6247
# Recall : 0.5920
# F1 : 0.6079
# Prevalence : 0.4974
# Detection Rate : 0.2944
# Detection Prevalence : 0.4713
# Balanced Accuracy : 0.6200
#AUC Score
#as.numeric(levels(f))[as.integer(f)] is the standard idiom for converting a
#factor of numeric labels back to numeric values.
#Note: the predictor here is a hard 0/1 classification rather than a probability,
#so this "AUC" collapses to the balanced accuracy (0.62 matches the confusion
#matrix's Balanced Accuracy above), not a full ranking-based AUC.
knnauc <- roc(test.set$shares_cat, as.numeric(levels(knn_prediction))[as.integer(knn_prediction)])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(knnauc)
##
## Call:
## roc.default(response = test.set$shares_cat, predictor = as.numeric(levels(knn_prediction))[as.integer(knn_prediction)])
##
## Data: as.numeric(levels(knn_prediction))[as.integer(knn_prediction)] in 2989 controls (test.set$shares_cat 0) < 2958 cases (test.set$shares_cat 1).
## Area under the curve: 0.62
#Area under the curve: 0.62
#Plot AUC
plot(knnauc)
#Plot ROC curve
knnroc_pred <- prediction(as.numeric(levels(knn_prediction))[as.integer(knn_prediction)], test.set$shares_cat)
knn_performance <- performance(knnroc_pred, "tpr", "fpr")
plot(knn_performance, colorize=TRUE)
#Using the fold-based train/test split
#Classify the test observations using k = 23 neighbours drawn from the training set
knn_prediction2 <- knn(train_new, test_new, cl = train_labels, k = 23)
#Confusion matrix and derived statistics, with "1" (popular) as the positive class
knn_ConfusionMatrix_stats2 <- confusionMatrix(
  data = knn_prediction2, reference = test_labels, mode = "everything", positive = "1"
)
print(knn_ConfusionMatrix_stats2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1335 813
## 1 674 1143
##
## Accuracy : 0.625
## 95% CI : (0.6097, 0.6401)
## No Information Rate : 0.5067
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2491
##
## Mcnemar's Test P-Value : 0.0003453
##
## Sensitivity : 0.5844
## Specificity : 0.6645
## Pos Pred Value : 0.6291
## Neg Pred Value : 0.6215
## Precision : 0.6291
## Recall : 0.5844
## F1 : 0.6059
## Prevalence : 0.4933
## Detection Rate : 0.2883
## Detection Prevalence : 0.4583
## Balanced Accuracy : 0.6244
##
## 'Positive' Class : 1
##
#Confusion Matrix and Statistics
# Reference
#Prediction 0 1
# 0 1335 813
# 1 674 1143
#Accuracy : 0.625
#95% CI : (0.6097, 0.6401)
#No Information Rate : 0.5067
#P-Value [Acc > NIR] : < 2.2e-16
#Kappa : 0.2491
#Mcnemar's Test P-Value : 0.0003453
# Sensitivity : 0.5844
# Specificity : 0.6645
# Pos Pred Value : 0.6291
# Neg Pred Value : 0.6215
# Precision : 0.6291
# Recall : 0.5844
# F1 : 0.6059
# Prevalence : 0.4933
# Detection Rate : 0.2883
# Detection Prevalence : 0.4583
# Balanced Accuracy : 0.6244
#AUC Score
#Convert the factor predictions back to numeric 0/1 via the
#as.numeric(levels(x))[as.integer(x)] idiom, then compute the ROC/AUC
#(pROC auto-detects levels and direction, as the messages below record)
knnauc2 <- roc(test$shares_cat, as.numeric(levels(knn_prediction2))[as.integer(knn_prediction2)])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(knnauc2)
##
## Call:
## roc.default(response = test$shares_cat, predictor = as.numeric(levels(knn_prediction2))[as.integer(knn_prediction2)])
##
## Data: as.numeric(levels(knn_prediction2))[as.integer(knn_prediction2)] in 2009 controls (test$shares_cat 0) < 1956 cases (test$shares_cat 1).
## Area under the curve: 0.6244
#Area under the curve: 0.6244
#Plot AUC
plot(knnauc2)
#Plot the ROC curve with ROCR: convert the factor predictions to their
#numeric 0/1 values once, build the prediction object, then plot TPR vs FPR
knn_pred_numeric2 <- as.numeric(levels(knn_prediction2))[as.integer(knn_prediction2)]
knnroc_pred2 <- prediction(knn_pred_numeric2, test$shares_cat)
knn_performance2 <- performance(knnroc_pred2, measure = "tpr", x.measure = "fpr")
plot(knn_performance2, colorize = TRUE)
#Decision Trees
#install.packages("party")
#Load party for conditional inference trees: ctree() and ctree_control()
library("party")
## Warning: package 'party' was built under R version 3.6.2
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:plyr':
##
## empty
## The following object is masked from 'package:arules':
##
## info
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'party'
## The following object is masked from 'package:arulesCBA':
##
## response
#Tune the model based on mincriterion - the minimum value of the test
#statistic (1 - p-value) - over a grid of 6 values, with 10-fold
#cross-validation on the training set
set.seed(123)
cv_grid_control <- trainControl(method = "cv", search = "grid")
ctreefit <- train(shares_cat ~ ., data = train.set, method = "ctree", tuneLength = 6,
                  trControl = cv_grid_control)
#Print the training model to see the cross-validated accuracy per mincriterion
print(ctreefit)
## Conditional Inference Tree
##
## 27750 samples
## 26 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 24974, 24975, 24975, 24975, 24975, 24975, ...
## Resampling results across tuning parameters:
##
## mincriterion Accuracy Kappa
## 0.010 0.5997473 0.1989281
## 0.206 0.6216938 0.2433777
## 0.402 0.6246488 0.2493046
## 0.598 0.6266667 0.2531889
## 0.794 0.6278202 0.2552605
## 0.990 0.6263789 0.2528885
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mincriterion = 0.794.
#mincriterion Accuracy Kappa
#0.010 0.5997473 0.1989281
#0.206 0.6216938 0.2433777
#0.402 0.6246488 0.2493046
#0.598 0.6266667 0.2531889
#0.794 0.6278202 0.2552605
#0.990 0.6263789 0.2528885
#Accuracy was used to select the optimal model using the largest value.
#The final value used for the model was mincriterion = 0.794.
#Plot the cross-validation results (accuracy vs mincriterion)
plot(ctreefit)
#The most accurate predictions were made with mincriterion = 0.794
#Since mincriterion = 0.99 is very close in accuracy, this value may be chosen for the model since mincriterion
#is determined by the p-value
#Prune the tree changing the control parameter to mincriterion = 0.99
#Run the model on the validation set
set.seed(123)
pruning_controls <- ctree_control(mincriterion = 0.99)
ctree_pruned <- ctree(shares_cat ~ ., data = valid.set, controls = pruning_controls)
#Make a prediction on the test set
ctree_pruned_prediction <- predict(ctree_pruned, test.set)
#Print the fitted decision tree structure
print(ctree_pruned)
##
## Conditional inference tree with 15 terminal nodes
##
## Response: shares_cat
## Inputs: kw_max_avg, self_reference_min_shares, LDA_00, kw_min_avg, LDA_02, weekday_is_saturday, LDA_04, data_channel_is_socmed, data_channel_is_entertainment, LDA_01, LDA_03, n_non_stop_unique_tokens, num_hrefs, kw_max_max, num_imgs, kw_avg_max, weekday_is_sunday, average_token_length, global_rate_positive_words, global_subjectivity, kw_max_min, avg_positive_polarity, global_rate_negative_words, avg_negative_polarity, n_tokens_title, title_sentiment_polarity
## Number of observations: 5947
##
## 1) LDA_02 <= 0.6314686; criterion = 1, statistic = 138.984
## 2) data_channel_is_entertainment == {1}; criterion = 1, statistic = 91.715
## 3) weekday_is_sunday == {1}; criterion = 1, statistic = 19.241
## 4)* weights = 86
## 3) weekday_is_sunday == {0}
## 5) weekday_is_saturday == {1}; criterion = 0.999, statistic = 17.112
## 6)* weights = 69
## 5) weekday_is_saturday == {0}
## 7) self_reference_min_shares <= 0.002964544; criterion = 0.998, statistic = 16.028
## 8)* weights = 770
## 7) self_reference_min_shares > 0.002964544
## 9)* weights = 194
## 2) data_channel_is_entertainment == {0}
## 10) weekday_is_saturday == {0}; criterion = 1, statistic = 65.636
## 11) num_hrefs <= 0.04276316; criterion = 1, statistic = 53.82
## 12) data_channel_is_socmed == {1}; criterion = 1, statistic = 34.097
## 13) LDA_00 <= 0.2259519; criterion = 0.996, statistic = 14.382
## 14)* weights = 63
## 13) LDA_00 > 0.2259519
## 15)* weights = 169
## 12) data_channel_is_socmed == {0}
## 16) n_non_stop_unique_tokens <= 0.0009482759; criterion = 1, statistic = 26.93
## 17)* weights = 430
## 16) n_non_stop_unique_tokens > 0.0009482759
## 18) weekday_is_sunday == {1}; criterion = 0.999, statistic = 16.896
## 19)* weights = 135
## 18) weekday_is_sunday == {0}
## 20) kw_min_avg <= 0.4634059; criterion = 0.997, statistic = 15.029
## 21)* weights = 1353
## 20) kw_min_avg > 0.4634059
## 22)* weights = 642
## 11) num_hrefs > 0.04276316
## 23)* weights = 873
## 10) weekday_is_saturday == {1}
## 24)* weights = 252
## 1) LDA_02 > 0.6314686
## 25) weekday_is_saturday == {1}; criterion = 0.999, statistic = 18.202
## 26)* weights = 54
## 25) weekday_is_saturday == {0}
## 27) weekday_is_sunday == {0}; criterion = 0.995, statistic = 14.076
## 28)* weights = 801
## 27) weekday_is_sunday == {1}
## 29)* weights = 56
#1) LDA_02 <= 0.6314686; criterion = 1, statistic = 138.984
# 2) data_channel_is_entertainment == {1}; criterion = 1, statistic = 91.715
# 3) weekday_is_sunday == {1}; criterion = 1, statistic = 19.241
# 4)* weights = 86
# 3) weekday_is_sunday == {0}
# 5) weekday_is_saturday == {1}; criterion = 0.999, statistic = 17.112
# 6)* weights = 69
# 5) weekday_is_saturday == {0}
# 7) self_reference_min_shares <= 0.002964544; criterion = 0.998, statistic = 16.028
# 8)* weights = 770
# 7) self_reference_min_shares > 0.002964544
# 9)* weights = 194
# 2) data_channel_is_entertainment == {0}
# 10) weekday_is_saturday == {0}; criterion = 1, statistic = 65.636
# 11) num_hrefs <= 0.04276316; criterion = 1, statistic = 53.82
# 12) data_channel_is_socmed == {1}; criterion = 1, statistic = 34.097
# 13) LDA_00 <= 0.2259519; criterion = 0.996, statistic = 14.382
# 14)* weights = 63
# 13) LDA_00 > 0.2259519
# 15)* weights = 169
# 12) data_channel_is_socmed == {0}
# 16) n_non_stop_unique_tokens <= 0.0009482759; criterion = 1, statistic = 26.93
# 17)* weights = 430
# 16) n_non_stop_unique_tokens > 0.0009482759
# 18) weekday_is_sunday == {1}; criterion = 0.999, statistic = 16.896
# 19)* weights = 135
# 18) weekday_is_sunday == {0}
# 20) kw_min_avg <= 0.4634059; criterion = 0.997, statistic = 15.029
# 21)* weights = 1353
# 20) kw_min_avg > 0.4634059
# 22)* weights = 642
# 11) num_hrefs > 0.04276316
# 23)* weights = 873
# 10) weekday_is_saturday == {1}
# 24)* weights = 252
#1) LDA_02 > 0.6314686
# 25) weekday_is_saturday == {1}; criterion = 0.999, statistic = 18.202
# 26)* weights = 54
# 25) weekday_is_saturday == {0}
# 27) weekday_is_sunday == {0}; criterion = 0.995, statistic = 14.076
# 28)* weights = 801
# 27) weekday_is_sunday == {1}
# 29)* weights = 56
#Plot the decision tree
plot(ctree_pruned, type = "simple")
#At the root node of the tree is closeness to LDA topic 2. This variable will help decide the classes of popularity.
#For instance, based on the decision tree plotted, if closeness to LDA topic 2 is greater than 0.631, check if the day
#the article was published is Saturday.
#If it is Saturday, then 42.6% of the articles are unpopular, and 57.4% of the articles are popular; which is the case
#for 54 samples.
#If the day the article was published is not Saturday, then check if it is Sunday. If it is not Sunday then 71.9% of the
#articles are not popular whereas 28.1% of the articles are popular, and this is the case for 801 samples.
#If it is Sunday, then 48.2% of the articles are unpopular whereas 51.8% of the articles are popular, and this
#is the case for 56 samples.
#Important variables found to help determine whether an article will be popular are closeness to LDA topic 2, whether
#data channel is entertainment, whether the day the article was published is Sunday, whether the day the article was
#published is Saturday, minimum shares of referenced articles in Mashable, number of links, whether the data channel is
#social media, closeness to LDA topic 0, rate of unique non-stop words in the content, minimum shares of the average
#keyword, and so on.
#Confusion Matrix and Statistics
#Use `<-` for assignment, consistent with the rest of the script
ctree_pruned_ConfusionMatrix_stats <- confusionMatrix(ctree_pruned_prediction, test.set$shares_cat, mode = "everything",
                                                      positive = "1")
print(ctree_pruned_ConfusionMatrix_stats)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1829 1141
## 1 1160 1817
##
## Accuracy : 0.6131
## 95% CI : (0.6006, 0.6255)
## No Information Rate : 0.5026
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.2262
##
## Mcnemar's Test P-Value : 0.7075
##
## Sensitivity : 0.6143
## Specificity : 0.6119
## Pos Pred Value : 0.6103
## Neg Pred Value : 0.6158
## Precision : 0.6103
## Recall : 0.6143
## F1 : 0.6123
## Prevalence : 0.4974
## Detection Rate : 0.3055
## Detection Prevalence : 0.5006
## Balanced Accuracy : 0.6131
##
## 'Positive' Class : 1
##
#Confusion Matrix and Statistics
# Reference
#Prediction 0 1
# 0 1829 1141
# 1 1160 1817
#Accuracy : 0.6131
#95% CI : (0.6006, 0.6255)
#No Information Rate : 0.5026
#P-Value [Acc > NIR] : <2e-16
#Kappa : 0.2262
#Mcnemar's Test P-Value : 0.7075
# Sensitivity : 0.6143
# Specificity : 0.6119
# Pos Pred Value : 0.6103
# Neg Pred Value : 0.6158
# Precision : 0.6103
# Recall : 0.6143
# F1 : 0.6123
# Prevalence : 0.4974
# Detection Rate : 0.3055
# Detection Prevalence : 0.5006
# Balanced Accuracy : 0.6131
#AUC Score
#Map the factor predictions back to numeric 0/1 and compute the ROC/AUC
#(pROC auto-detects levels and direction, as recorded below)
ctreeauc <- roc(test.set$shares_cat, as.numeric(levels(ctree_pruned_prediction))[as.integer(ctree_pruned_prediction)])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(ctreeauc)
##
## Call:
## roc.default(response = test.set$shares_cat, predictor = as.numeric(levels(ctree_pruned_prediction))[as.integer(ctree_pruned_prediction)])
##
## Data: as.numeric(levels(ctree_pruned_prediction))[as.integer(ctree_pruned_prediction)] in 2989 controls (test.set$shares_cat 0) < 2958 cases (test.set$shares_cat 1).
## Area under the curve: 0.6131
#Area under the curve: 0.6131
#Plot AUC
plot(ctreeauc)
#Plot the ROC curve with ROCR: convert the factor predictions to their
#numeric 0/1 values once, build the prediction object, then plot TPR vs FPR
ctree_pred_numeric <- as.numeric(levels(ctree_pruned_prediction))[as.integer(ctree_pruned_prediction)]
ctreeroc_pred <- prediction(ctree_pred_numeric, test.set$shares_cat)
ctree_performance <- performance(ctreeroc_pred, measure = "tpr", x.measure = "fpr")
plot(ctree_performance, colorize = TRUE)
#Using 10-fold cross-validation
#Create a decision tree model on the full training set using the best
#parameter found above (mincriterion = 0.99)
set.seed(123)
full_tree_controls <- ctree_control(mincriterion = 0.99)
ctree_model <- ctree(shares_cat ~ ., data = train, controls = full_tree_controls)
#Print the fitted decision tree structure
print(ctree_model)
##
## Conditional inference tree with 46 terminal nodes
##
## Response: shares_cat
## Inputs: kw_max_avg, self_reference_min_shares, LDA_00, kw_min_avg, LDA_02, weekday_is_saturday, LDA_04, data_channel_is_socmed, data_channel_is_entertainment, LDA_01, LDA_03, n_non_stop_unique_tokens, num_hrefs, kw_max_max, num_imgs, kw_avg_max, weekday_is_sunday, average_token_length, global_rate_positive_words, global_subjectivity, kw_max_min, avg_positive_polarity, global_rate_negative_words, avg_negative_polarity, n_tokens_title, title_sentiment_polarity
## Number of observations: 35679
##
## 1) LDA_02 <= 0.5968011; criterion = 1, statistic = 869.307
## 2) data_channel_is_entertainment == {0}; criterion = 1, statistic = 736.054
## 3) weekday_is_saturday == {0}; criterion = 1, statistic = 279.203
## 4) kw_min_avg <= 0.6040266; criterion = 1, statistic = 216.583
## 5) data_channel_is_socmed == {0}; criterion = 1, statistic = 161.159
## 6) num_hrefs <= 0.04605263; criterion = 1, statistic = 149.883
## 7) weekday_is_sunday == {1}; criterion = 1, statistic = 113.296
## 8) LDA_01 <= 0.04319949; criterion = 0.999, statistic = 16.598
## 9)* weights = 466
## 8) LDA_01 > 0.04319949
## 10)* weights = 279
## 7) weekday_is_sunday == {0}
## 11) LDA_00 <= 0.2589749; criterion = 1, statistic = 54.745
## 12) LDA_02 <= 0.3881332; criterion = 1, statistic = 54.484
## 13) n_non_stop_unique_tokens <= 0.001315496; criterion = 1, statistic = 18.327
## 14)* weights = 5574
## 13) n_non_stop_unique_tokens > 0.001315496
## 15) global_rate_negative_words <= 0.1430531; criterion = 1, statistic = 22.144
## 16) n_tokens_title <= 0.4285714; criterion = 1, statistic = 20.052
## 17)* weights = 256
## 16) n_tokens_title > 0.4285714
## 18)* weights = 137
## 15) global_rate_negative_words > 0.1430531
## 19)* weights = 114
## 12) LDA_02 > 0.3881332
## 20) global_rate_positive_words <= 0.1843732; criterion = 0.997, statistic = 14.723
## 21)* weights = 407
## 20) global_rate_positive_words > 0.1843732
## 22)* weights = 739
## 11) LDA_00 > 0.2589749
## 23) global_rate_positive_words <= 0.2371013; criterion = 1, statistic = 21.81
## 24)* weights = 1739
## 23) global_rate_positive_words > 0.2371013
## 25) global_subjectivity <= 0.4190236; criterion = 1, statistic = 25.692
## 26)* weights = 856
## 25) global_subjectivity > 0.4190236
## 27)* weights = 1687
## 6) num_hrefs > 0.04605263
## 28) global_subjectivity <= 0.4068627; criterion = 1, statistic = 23.36
## 29)* weights = 485
## 28) global_subjectivity > 0.4068627
## 30) weekday_is_sunday == {1}; criterion = 1, statistic = 21.034
## 31) LDA_01 <= 0.6242477; criterion = 0.993, statistic = 13.16
## 32)* weights = 249
## 31) LDA_01 > 0.6242477
## 33)* weights = 17
## 30) weekday_is_sunday == {0}
## 34) kw_max_avg <= 0.0121861; criterion = 0.994, statistic = 13.527
## 35) LDA_04 <= 0.7096185; criterion = 0.994, statistic = 13.438
## 36)* weights = 280
## 35) LDA_04 > 0.7096185
## 37)* weights = 193
## 34) kw_max_avg > 0.0121861
## 38)* weights = 1965
## 5) data_channel_is_socmed == {1}
## 39) LDA_00 <= 0.1358852; criterion = 1, statistic = 31.932
## 40)* weights = 266
## 39) LDA_00 > 0.1358852
## 41) n_non_stop_unique_tokens <= 0.001144902; criterion = 1, statistic = 21.664
## 42)* weights = 647
## 41) n_non_stop_unique_tokens > 0.001144902
## 43)* weights = 224
## 4) kw_min_avg > 0.6040266
## 44) kw_avg_max <= 0.5277778; criterion = 1, statistic = 92.725
## 45) LDA_00 <= 0.7223975; criterion = 1, statistic = 38.217
## 46) LDA_01 <= 0.03611333; criterion = 0.999, statistic = 18.082
## 47) data_channel_is_socmed == {1}; criterion = 0.994, statistic = 13.595
## 48) n_non_stop_unique_tokens <= 0.001164241; criterion = 0.997, statistic = 14.789
## 49)* weights = 128
## 48) n_non_stop_unique_tokens > 0.001164241
## 50)* weights = 33
## 47) data_channel_is_socmed == {0}
## 51)* weights = 1719
## 46) LDA_01 > 0.03611333
## 52) kw_min_avg <= 0.6597951; criterion = 0.996, statistic = 14.259
## 53)* weights = 263
## 52) kw_min_avg > 0.6597951
## 54)* weights = 1195
## 45) LDA_00 > 0.7223975
## 55)* weights = 475
## 44) kw_avg_max > 0.5277778
## 56)* weights = 1565
## 3) weekday_is_saturday == {1}
## 57) LDA_00 <= 0.3962087; criterion = 1, statistic = 27.009
## 58) LDA_01 <= 0.04320616; criterion = 0.998, statistic = 16.015
## 59)* weights = 725
## 58) LDA_01 > 0.04320616
## 60)* weights = 391
## 57) LDA_00 > 0.3962087
## 61)* weights = 355
## 2) data_channel_is_entertainment == {1}
## 62) weekday_is_sunday == {1}; criterion = 1, statistic = 113.347
## 63)* weights = 483
## 62) weekday_is_sunday == {0}
## 64) weekday_is_saturday == {0}; criterion = 1, statistic = 72.14
## 65) kw_max_avg <= 0.01303062; criterion = 1, statistic = 38.531
## 66) kw_max_max <= 0.7327167; criterion = 1, statistic = 28.103
## 67)* weights = 308
## 66) kw_max_max > 0.7327167
## 68)* weights = 1572
## 65) kw_max_avg > 0.01303062
## 69) self_reference_min_shares <= 0.003320289; criterion = 1, statistic = 24.623
## 70)* weights = 2859
## 69) self_reference_min_shares > 0.003320289
## 71)* weights = 691
## 64) weekday_is_saturday == {1}
## 72)* weights = 325
## 1) LDA_02 > 0.5968011
## 73) data_channel_is_socmed == {1}; criterion = 1, statistic = 88.108
## 74)* weights = 223
## 73) data_channel_is_socmed == {0}
## 75) weekday_is_saturday == {0}; criterion = 1, statistic = 57.96
## 76) num_imgs <= 0.0546875; criterion = 1, statistic = 58.131
## 77) weekday_is_sunday == {0}; criterion = 1, statistic = 24.364
## 78) kw_max_max <= 0.7327167; criterion = 1, statistic = 18.694
## 79)* weights = 439
## 78) kw_max_max > 0.7327167
## 80) num_hrefs <= 0.08881579; criterion = 1, statistic = 21.495
## 81) LDA_02 <= 0.7970923; criterion = 0.997, statistic = 15.097
## 82)* weights = 1387
## 81) LDA_02 > 0.7970923
## 83) self_reference_min_shares <= 0.002253053; criterion = 0.996, statistic = 14.514
## 84)* weights = 2128
## 83) self_reference_min_shares > 0.002253053
## 85)* weights = 474
## 80) num_hrefs > 0.08881579
## 86)* weights = 129
## 77) weekday_is_sunday == {1}
## 87)* weights = 337
## 76) num_imgs > 0.0546875
## 88)* weights = 515
## 75) weekday_is_saturday == {1}
## 89) kw_max_max <= 0.8186885; criterion = 0.999, statistic = 17.163
## 90)* weights = 43
## 89) kw_max_max > 0.8186885
## 91)* weights = 337
#1) LDA_02 <= 0.5968011; criterion = 1, statistic = 869.307
# 2) data_channel_is_entertainment == {0}; criterion = 1, statistic = 736.054
# 3) weekday_is_saturday == {0}; criterion = 1, statistic = 279.203
# 4) kw_min_avg <= 0.6040266; criterion = 1, statistic = 216.583
# 5) data_channel_is_socmed == {0}; criterion = 1, statistic = 161.159
# 6) num_hrefs <= 0.04605263; criterion = 1, statistic = 149.883
# 7) weekday_is_sunday == {1}; criterion = 1, statistic = 113.296
# 8) LDA_01 <= 0.04319949; criterion = 0.999, statistic = 16.598
# 9)* weights = 466
# 8) LDA_01 > 0.04319949
# 10)* weights = 279
# 7) weekday_is_sunday == {0}
# 11) LDA_00 <= 0.2589749; criterion = 1, statistic = 54.745
# 12) LDA_02 <= 0.3881332; criterion = 1, statistic = 54.484
# 13) n_non_stop_unique_tokens <= 0.001315496; criterion = 1, statistic = 18.327
# 14)* weights = 5574
# 13) n_non_stop_unique_tokens > 0.001315496
# 15) global_rate_negative_words <= 0.1430531; criterion = 1, statistic = 22.144
# 16) n_tokens_title <= 0.4285714; criterion = 1, statistic = 20.052
# 17)* weights = 256
# 16) n_tokens_title > 0.4285714
# 18)* weights = 137
# 15) global_rate_negative_words > 0.1430531
# 19)* weights = 114
# 12) LDA_02 > 0.3881332
# 20) global_rate_positive_words <= 0.1843732; criterion = 0.997, statistic = 14.723
# 21)* weights = 407
# 20) global_rate_positive_words > 0.1843732
# 22)* weights = 739
# 11) LDA_00 > 0.2589749
# 23) global_rate_positive_words <= 0.2371013; criterion = 1, statistic = 21.81
# 24)* weights = 1739
# 23) global_rate_positive_words > 0.2371013
# 25) global_subjectivity <= 0.4190236; criterion = 1, statistic = 25.692
# 26)* weights = 856
# 25) global_subjectivity > 0.4190236
# 27)* weights = 1687
# 6) num_hrefs > 0.04605263
# 28) global_subjectivity <= 0.4068627; criterion = 1, statistic = 23.36
# 29)* weights = 485
# 28) global_subjectivity > 0.4068627
# 30) weekday_is_sunday == {1}; criterion = 1, statistic = 21.034
# 31) LDA_01 <= 0.6242477; criterion = 0.993, statistic = 13.16
# 32)* weights = 249
# 31) LDA_01 > 0.6242477
# 33)* weights = 17
# 30) weekday_is_sunday == {0}
# 34) kw_max_avg <= 0.0121861; criterion = 0.994, statistic = 13.527
# 35) LDA_04 <= 0.7096185; criterion = 0.994, statistic = 13.438
# 36)* weights = 280
# 35) LDA_04 > 0.7096185
# 37)* weights = 193
# 34) kw_max_avg > 0.0121861
# 38)* weights = 1965
# 5) data_channel_is_socmed == {1}
# 39) LDA_00 <= 0.1358852; criterion = 1, statistic = 31.932
# 40)* weights = 266
# 39) LDA_00 > 0.1358852
# 41) n_non_stop_unique_tokens <= 0.001144902; criterion = 1, statistic = 21.664
# 42)* weights = 647
# 41) n_non_stop_unique_tokens > 0.001144902
# 43)* weights = 224
# 4) kw_min_avg > 0.6040266
# 44) kw_avg_max <= 0.5277778; criterion = 1, statistic = 92.725
# 45) LDA_00 <= 0.7223975; criterion = 1, statistic = 38.217
# 46) LDA_01 <= 0.03611333; criterion = 0.999, statistic = 18.082
# 47) data_channel_is_socmed == {1}; criterion = 0.994, statistic = 13.595
# 48) n_non_stop_unique_tokens <= 0.001164241; criterion = 0.997, statistic = 14.789
# 49)* weights = 128
# 48) n_non_stop_unique_tokens > 0.001164241
# 50)* weights = 33
# 47) data_channel_is_socmed == {0}
# 51)* weights = 1719
# 46) LDA_01 > 0.03611333
# 52) kw_min_avg <= 0.6597951; criterion = 0.996, statistic = 14.259
# 53)* weights = 263
# 52) kw_min_avg > 0.6597951
# 54)* weights = 1195
# 45) LDA_00 > 0.7223975
# 55)* weights = 475
# 44) kw_avg_max > 0.5277778
# 56)* weights = 1565
# 3) weekday_is_saturday == {1}
# 57) LDA_00 <= 0.3962087; criterion = 1, statistic = 27.009
# 58) LDA_01 <= 0.04320616; criterion = 0.998, statistic = 16.015
# 59)* weights = 725
# 58) LDA_01 > 0.04320616
# 60)* weights = 391
# 57) LDA_00 > 0.3962087
# 61)* weights = 355
# 2) data_channel_is_entertainment == {1}
# 62) weekday_is_sunday == {1}; criterion = 1, statistic = 113.347
# 63)* weights = 483
# 62) weekday_is_sunday == {0}
# 64) weekday_is_saturday == {0}; criterion = 1, statistic = 72.14
# 65) kw_max_avg <= 0.01303062; criterion = 1, statistic = 38.531
# 66) kw_max_max <= 0.7327167; criterion = 1, statistic = 28.103
# 67)* weights = 308
# 66) kw_max_max > 0.7327167
# 68)* weights = 1572
# 65) kw_max_avg > 0.01303062
# 69) self_reference_min_shares <= 0.003320289; criterion = 1, statistic = 24.623
# 70)* weights = 2859
# 69) self_reference_min_shares > 0.003320289
# 71)* weights = 691
# 64) weekday_is_saturday == {1}
# 72)* weights = 325
#1) LDA_02 > 0.5968011
# 73) data_channel_is_socmed == {1}; criterion = 1, statistic = 88.108
# 74)* weights = 223
# 73) data_channel_is_socmed == {0}
# 75) weekday_is_saturday == {0}; criterion = 1, statistic = 57.96
# 76) num_imgs <= 0.0546875; criterion = 1, statistic = 58.131
# 77) weekday_is_sunday == {0}; criterion = 1, statistic = 24.364
# 78) kw_max_max <= 0.7327167; criterion = 1, statistic = 18.694
# 79)* weights = 439
# 78) kw_max_max > 0.7327167
# 80) num_hrefs <= 0.08881579; criterion = 1, statistic = 21.495
# 81) LDA_02 <= 0.7970923; criterion = 0.997, statistic = 15.097
# 82)* weights = 1387
# 81) LDA_02 > 0.7970923
# 83) self_reference_min_shares <= 0.002253053; criterion = 0.996, statistic = 14.514
# 84)* weights = 2128
# 83) self_reference_min_shares > 0.002253053
# 85)* weights = 474
# 80) num_hrefs > 0.08881579
# 86)* weights = 129
# 77) weekday_is_sunday == {1}
# 87)* weights = 337
# 76) num_imgs > 0.0546875
# 88)* weights = 515
# 75) weekday_is_saturday == {1}
# 89) kw_max_max <= 0.8186885; criterion = 0.999, statistic = 17.163
# 90)* weights = 43
# 89) kw_max_max > 0.8186885
# 91)* weights = 337
#Plot the decision tree
plot(ctree_model, type = "simple")
#Important variables found to help determine whether an article will be popular are closeness to LDA topic 2
#(statistic = 869.307), followed by whether the data channel is entertainment (statistic = 736.054), whether the day the
#article was published is Saturday (statistic = 279.203), minimum shares of the average keyword (statistic = 216.583),
#whether the data channel is social media (statistic = 161.159), number of links (statistic = 149.883), whether the day
#the article was posted is Sunday (statistic = 113.296), average shares of the best keyword (statistic = 92.725),
#number of images (statistic = 58.131), closeness to LDA topic 0 (statistic = 54.745), and so on.
#Make a prediction on the test set.
ctree_prediction <- predict(ctree_model, test)
#Confusion Matrix and Statistics
#Use `<-` for assignment, consistent with the rest of the script
ctree_model_ConfusionMatrix_stats <- confusionMatrix(ctree_prediction, test$shares_cat, mode = "everything",
                                                     positive = "1")
print(ctree_model_ConfusionMatrix_stats)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1212 679
## 1 797 1277
##
## Accuracy : 0.6277
## 95% CI : (0.6125, 0.6428)
## No Information Rate : 0.5067
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2559
##
## Mcnemar's Test P-Value : 0.002324
##
## Sensitivity : 0.6529
## Specificity : 0.6033
## Pos Pred Value : 0.6157
## Neg Pred Value : 0.6409
## Precision : 0.6157
## Recall : 0.6529
## F1 : 0.6337
## Prevalence : 0.4933
## Detection Rate : 0.3221
## Detection Prevalence : 0.5231
## Balanced Accuracy : 0.6281
##
## 'Positive' Class : 1
##
#Confusion Matrix and Statistics
# Reference
#Prediction 0 1
# 0 1212 679
# 1 797 1277
#Accuracy : 0.6277
#95% CI : (0.6125, 0.6428)
#No Information Rate : 0.5067
#P-Value [Acc > NIR] : < 2.2e-16
#Kappa : 0.2559
#Mcnemar's Test P-Value : 0.002324
# Sensitivity : 0.6529
# Specificity : 0.6033
# Pos Pred Value : 0.6157
# Neg Pred Value : 0.6409
# Precision : 0.6157
# Recall : 0.6529
# F1 : 0.6337
# Prevalence : 0.4933
# Detection Rate : 0.3221
# Detection Prevalence : 0.5231
# Balanced Accuracy : 0.6281
#AUC Score
#Map the factor predictions back to numeric 0/1 and compute the ROC/AUC
#(pROC auto-detects levels and direction, as recorded below)
ctreeauc2 <- roc(test$shares_cat, as.numeric(levels(ctree_prediction))[as.integer(ctree_prediction)])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(ctreeauc2)
##
## Call:
## roc.default(response = test$shares_cat, predictor = as.numeric(levels(ctree_prediction))[as.integer(ctree_prediction)])
##
## Data: as.numeric(levels(ctree_prediction))[as.integer(ctree_prediction)] in 2009 controls (test$shares_cat 0) < 1956 cases (test$shares_cat 1).
## Area under the curve: 0.6281
#Area under the curve: 0.6281
#Plot AUC
plot(ctreeauc2)
#Plot the ROC curve with ROCR: convert the factor predictions to their
#numeric 0/1 values once, build the prediction object, then plot TPR vs FPR
ctree_pred_numeric2 <- as.numeric(levels(ctree_prediction))[as.integer(ctree_prediction)]
ctreeroc_pred2 <- prediction(ctree_pred_numeric2, test$shares_cat)
ctree_performance2 <- performance(ctreeroc_pred2, measure = "tpr", x.measure = "fpr")
plot(ctree_performance2, colorize = TRUE)
#Random Forest
#Tune mtry - the number of variables randomly sampled as candidates at each
#split - over a grid of 3 values, with 10-fold cross-validation
set.seed(123)
rf_cv_control <- trainControl(method = "cv", search = "grid")
rffit <- train(shares_cat ~ ., data = train.set, method = "rf", tuneLength = 3,
               trControl = rf_cv_control)
#Print the training model to see the cross-validated accuracy per mtry
print(rffit)
## Random Forest
##
## 27750 samples
## 26 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 24974, 24975, 24975, 24975, 24975, 24975, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.6622345 0.3242408
## 14 0.6541983 0.3082772
## 26 0.6520358 0.3039446
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
#mtry Accuracy Kappa
# 2 0.6622345 0.3242408
#14 0.6541983 0.3082772
#26 0.6520358 0.3039446
#Accuracy was used to select the optimal model using the largest value.
#The final value used for the model was mtry = 2.
#Plot the results
#Visualize the accuracy based on the value of mtry.
plot(rffit)
#Create a Random Forest model using the validation set with mtry = 2
#(best mtry from the caret grid search above); importance = TRUE stores the
#variable importance measures needed by importance() later
set.seed(123)
rf_model <- randomForest(shares_cat ~., data = valid.set, mtry = 2, importance = TRUE)
#Print the model
#(print.randomForest echoes this exact call, as the recorded output shows)
print(rf_model)
##
## Call:
## randomForest(formula = shares_cat ~ ., data = valid.set, mtry = 2, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 34.81%
## Confusion matrix:
## 0 1 class.error
## 0 2060 982 0.3228139
## 1 1088 1817 0.3745267
#Number of trees: 500
#No. of variables tried at each split: 2
#OOB estimate of error rate: 34.81%
#Confusion matrix:
# 0 1 class.error
#0 2060 982 0.3228139
#1 1088 1817 0.3745267
#Plot the out-of-bag error of the forest against the number of trees
plot(rf_model)
#The errors from the model decrease until around 200 trees and then flatten.
#Make a prediction on the test set
rf_predict <- predict(rf_model, newdata = test.set, type = "class")
#Confusion Matrix and Statistics; the positive class is "1" (popular articles)
rf_confusionMatrix_stats <- confusionMatrix(data = rf_predict, reference = test.set$shares_cat,
                                            mode = "everything", positive = "1")
print(rf_confusionMatrix_stats)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1990 1120
## 1 999 1838
##
## Accuracy : 0.6437
## 95% CI : (0.6314, 0.6559)
## No Information Rate : 0.5026
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2872
##
## Mcnemar's Test P-Value : 0.009138
##
## Sensitivity : 0.6214
## Specificity : 0.6658
## Pos Pred Value : 0.6479
## Neg Pred Value : 0.6399
## Precision : 0.6479
## Recall : 0.6214
## F1 : 0.6343
## Prevalence : 0.4974
## Detection Rate : 0.3091
## Detection Prevalence : 0.4770
## Balanced Accuracy : 0.6436
##
## 'Positive' Class : 1
##
#Confusion Matrix and Statistics
# Reference
#Prediction 0 1
# 0 1990 1120
# 1 999 1838
#Accuracy : 0.6437
#95% CI : (0.6314, 0.6559)
#No Information Rate : 0.5026
#P-Value [Acc > NIR] : < 2.2e-16
#Kappa : 0.2872
#Mcnemar's Test P-Value : 0.009138
# Sensitivity : 0.6214
# Specificity : 0.6658
# Pos Pred Value : 0.6479
# Neg Pred Value : 0.6399
# Precision : 0.6479
# Recall : 0.6214
# F1 : 0.6343
# Prevalence : 0.4974
# Detection Rate : 0.3091
# Detection Prevalence : 0.4770
# Balanced Accuracy : 0.6436
#AUC Score
#Map the factor predictions back to numeric 0/1 and compute the ROC/AUC
#(pROC auto-detects levels and direction, as recorded below)
rfauc <- roc(test.set$shares_cat, as.numeric(levels(rf_predict))[as.integer(rf_predict)])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(rfauc)
##
## Call:
## roc.default(response = test.set$shares_cat, predictor = as.numeric(levels(rf_predict))[as.integer(rf_predict)])
##
## Data: as.numeric(levels(rf_predict))[as.integer(rf_predict)] in 2989 controls (test.set$shares_cat 0) < 2958 cases (test.set$shares_cat 1).
## Area under the curve: 0.6436
#Area under the curve: 0.6436
#Plot AUC
plot(rfauc)
#Plot the ROC curve with ROCR: convert the factor predictions to their
#numeric 0/1 values once, build the prediction object, then plot TPR vs FPR
rf_pred_numeric <- as.numeric(levels(rf_predict))[as.integer(rf_predict)]
rfroc_pred <- prediction(rf_pred_numeric, test.set$shares_cat)
rf_performance <- performance(rfroc_pred, measure = "tpr", x.measure = "fpr")
plot(rf_performance, colorize = TRUE)
#Check the important variables
#Per-class and overall MeanDecreaseAccuracy plus MeanDecreaseGini for each
#predictor (available because the forest was fit with importance = TRUE)
importance(rf_model)
## 0 1 MeanDecreaseAccuracy
## kw_max_avg 12.7310120 14.75961893 20.702820
## self_reference_min_shares 17.0067342 13.77551416 20.478818
## LDA_00 8.7836745 7.25108839 12.232652
## kw_min_avg 20.4511226 0.80879881 17.378091
## LDA_02 18.6280357 2.52206351 17.332176
## weekday_is_saturday 21.3061769 11.56706968 21.518318
## LDA_04 10.8319983 6.07636732 14.161131
## data_channel_is_socmed 7.9607130 13.33131301 14.927426
## data_channel_is_entertainment 13.1381832 2.26272992 13.196795
## LDA_01 11.7698979 3.00109954 12.526222
## LDA_03 13.4850624 -0.74903440 11.002887
## n_non_stop_unique_tokens 11.4177800 3.82414586 12.006379
## num_hrefs 6.8585193 5.83000026 10.415695
## kw_max_max -0.1260238 7.04914180 5.672202
## num_imgs 10.3045425 0.96437227 9.694474
## kw_avg_max 8.2897306 0.05420312 6.925046
## weekday_is_sunday 8.7984839 6.78159179 10.237964
## average_token_length 8.4711551 -2.06198892 5.163225
## global_rate_positive_words 6.4144414 5.95345343 9.270298
## global_subjectivity 3.5315145 5.46060541 6.911155
## kw_max_min 9.4879264 3.51314322 9.712763
## avg_positive_polarity -0.8552206 6.95723819 4.330493
## global_rate_negative_words 4.9363931 1.39243881 4.714351
## avg_negative_polarity 2.8657762 2.33514021 3.765739
## n_tokens_title 2.3541071 1.97842684 3.008126
## title_sentiment_polarity 3.5005870 2.85885534 4.599657
## MeanDecreaseGini
## kw_max_avg 174.01898
## self_reference_min_shares 155.88233
## LDA_00 140.88378
## kw_min_avg 123.40070
## LDA_02 156.80113
## weekday_is_saturday 31.44492
## LDA_04 142.02218
## data_channel_is_socmed 19.59069
## data_channel_is_entertainment 23.36260
## LDA_01 136.59978
## LDA_03 134.73511
## n_non_stop_unique_tokens 145.02502
## num_hrefs 116.62004
## kw_max_max 39.92993
## num_imgs 79.16993
## kw_avg_max 136.95709
## weekday_is_sunday 18.43724
## average_token_length 129.97954
## global_rate_positive_words 131.54903
## global_subjectivity 135.10063
## kw_max_min 137.69209
## avg_positive_polarity 128.60078
## global_rate_negative_words 128.23493
## avg_negative_polarity 125.90124
## n_tokens_title 83.84260
## title_sentiment_polarity 87.11326
# 0 1 MeanDecreaseAccuracy MeanDecreaseGini
#kw_max_avg 12.7310120 14.75961893 20.702820 174.01898
#self_reference_min_shares 17.0067342 13.77551416 20.478818 155.88233
#LDA_00 8.7836745 7.25108839 12.232652 140.88378
#kw_min_avg 20.4511226 0.80879881 17.378091 123.40070
#LDA_02 18.6280357 2.52206351 17.332176 156.80113
#weekday_is_saturday 21.3061769 11.56706968 21.518318 31.44492
#LDA_04 10.8319983 6.07636732 14.161131 142.02218
#data_channel_is_socmed 7.9607130 13.33131301 14.927426 19.59069
#data_channel_is_entertainment 13.1381832 2.26272992 13.196795 23.36260
#LDA_01 11.7698979 3.00109954 12.526222 136.59978
#LDA_03 13.4850624 -0.74903440 11.002887 134.73511
#n_non_stop_unique_tokens 11.4177800 3.82414586 12.006379 145.02502
#num_hrefs 6.8585193 5.83000026 10.415695 116.62004
#kw_max_max -0.1260238 7.04914180 5.672202 39.92993
#num_imgs 10.3045425 0.96437227 9.694474 79.16993
#kw_avg_max 8.2897306 0.05420312 6.925046 136.95709
#weekday_is_sunday 8.7984839 6.78159179 10.237964 18.43724
#average_token_length 8.4711551 -2.06198892 5.163225 129.97954
#global_rate_positive_words 6.4144414 5.95345343 9.270298 131.54903
#global_subjectivity 3.5315145 5.46060541 6.911155 135.10063
#kw_max_min 9.4879264 3.51314322 9.712763 137.69209
#avg_positive_polarity -0.8552206 6.95723819 4.330493 128.60078
#global_rate_negative_words 4.9363931 1.39243881 4.714351 128.23493
#avg_negative_polarity 2.8657762 2.33514021 3.765739 125.90124
#n_tokens_title 2.3541071 1.97842684 3.008126 83.84260
#title_sentiment_polarity 3.5005870 2.85885534 4.599657 87.11326
#Evaluate variable importance according to mean decrease in accuracy and mean decrease in gini
#Mean Decrease in Accuracy (importance type = 1: permutation-based accuracy loss)
rf_imp_class <- importance(rf_model, type = 1)
rf_imp_class <- data.frame(predictors = rownames(rf_imp_class), rf_imp_class)
#Mean Decrease Gini (importance type = 2: total decrease in node impurity)
rf_imp_gini <- importance(rf_model, type = 2)
rf_imp_gini <- data.frame(predictors = rownames(rf_imp_gini), rf_imp_gini)
#Order the predictors by importance according to Mean Decrease in Accuracy
rf_imp_class.sort <- arrange(rf_imp_class, desc(MeanDecreaseAccuracy))
#Lock the factor levels to the sorted order so downstream plots keep this ordering
rf_imp_class.sort$predictors <- factor(rf_imp_class.sort$predictors, levels = rf_imp_class.sort$predictors)
#Print the predictors sorted by Mean Decrease in Accuracy
print(rf_imp_class.sort)
## predictors MeanDecreaseAccuracy
## 1 weekday_is_saturday 21.518318
## 2 kw_max_avg 20.702820
## 3 self_reference_min_shares 20.478818
## 4 kw_min_avg 17.378091
## 5 LDA_02 17.332176
## 6 data_channel_is_socmed 14.927426
## 7 LDA_04 14.161131
## 8 data_channel_is_entertainment 13.196795
## 9 LDA_01 12.526222
## 10 LDA_00 12.232652
## 11 n_non_stop_unique_tokens 12.006379
## 12 LDA_03 11.002887
## 13 num_hrefs 10.415695
## 14 weekday_is_sunday 10.237964
## 15 kw_max_min 9.712763
## 16 num_imgs 9.694474
## 17 global_rate_positive_words 9.270298
## 18 kw_avg_max 6.925046
## 19 global_subjectivity 6.911155
## 20 kw_max_max 5.672202
## 21 average_token_length 5.163225
## 22 global_rate_negative_words 4.714351
## 23 title_sentiment_polarity 4.599657
## 24 avg_positive_polarity 4.330493
## 25 avg_negative_polarity 3.765739
## 26 n_tokens_title 3.008126
# predictors MeanDecreaseAccuracy
#1 weekday_is_saturday 21.518318
#2 kw_max_avg 20.702820
#3 self_reference_min_shares 20.478818
#4 kw_min_avg 17.378091
#5 LDA_02 17.332176
#6 data_channel_is_socmed 14.927426
#7 LDA_04 14.161131
#8 data_channel_is_entertainment 13.196795
#9 LDA_01 12.526222
#10 LDA_00 12.232652
#11 n_non_stop_unique_tokens 12.006379
#12 LDA_03 11.002887
#13 num_hrefs 10.415695
#14 weekday_is_sunday 10.237964
#15 kw_max_min 9.712763
#16 num_imgs 9.694474
#17 global_rate_positive_words 9.270298
#18 kw_avg_max 6.925046
#19 global_subjectivity 6.911155
#20 kw_max_max 5.672202
#21 average_token_length 5.163225
#22 global_rate_negative_words 4.714351
#23 title_sentiment_polarity 4.599657
#24 avg_positive_polarity 4.330493
#25 avg_negative_polarity 3.765739
#26 n_tokens_title 3.008126
#The top predictors in terms of Mean Decrease in Accuracy include whether the article was published on Saturday, maximum
#shares of the average keyword, minimum shares of referenced articles in Mashable, minimum shares of the average keyword,
#closeness to LDA topic 2, whether the data channel is social media, closeness to LDA topic 4, whether the data channel
#is entertainment, closeness to LDA topic 1, closeness to LDA topic 0, and so on.
#Order the predictors by importance according to Mean Decrease in Gini
#Use the bare column name inside arrange() (data-masking) rather than rf_imp_gini$MeanDecreaseGini
rf_imp_gini.sort <- arrange(rf_imp_gini, desc(MeanDecreaseGini))
#Lock the factor levels to the sorted order so downstream plots keep this ordering
rf_imp_gini.sort$predictors <- factor(rf_imp_gini.sort$predictors, levels = rf_imp_gini.sort$predictors)
#Print the predictors sorted by Mean Decrease in Gini
print(rf_imp_gini.sort)
## predictors MeanDecreaseGini
## 1 kw_max_avg 174.01898
## 2 LDA_02 156.80113
## 3 self_reference_min_shares 155.88233
## 4 n_non_stop_unique_tokens 145.02502
## 5 LDA_04 142.02218
## 6 LDA_00 140.88378
## 7 kw_max_min 137.69209
## 8 kw_avg_max 136.95709
## 9 LDA_01 136.59978
## 10 global_subjectivity 135.10063
## 11 LDA_03 134.73511
## 12 global_rate_positive_words 131.54903
## 13 average_token_length 129.97954
## 14 avg_positive_polarity 128.60078
## 15 global_rate_negative_words 128.23493
## 16 avg_negative_polarity 125.90124
## 17 kw_min_avg 123.40070
## 18 num_hrefs 116.62004
## 19 title_sentiment_polarity 87.11326
## 20 n_tokens_title 83.84260
## 21 num_imgs 79.16993
## 22 kw_max_max 39.92993
## 23 weekday_is_saturday 31.44492
## 24 data_channel_is_entertainment 23.36260
## 25 data_channel_is_socmed 19.59069
## 26 weekday_is_sunday 18.43724
# predictors MeanDecreaseGini
#1 kw_max_avg 174.01898
#2 LDA_02 156.80113
#3 self_reference_min_shares 155.88233
#4 n_non_stop_unique_tokens 145.02502
#5 LDA_04 142.02218
#6 LDA_00 140.88378
#7 kw_max_min 137.69209
#8 kw_avg_max 136.95709
#9 LDA_01 136.59978
#10 global_subjectivity 135.10063
#11 LDA_03 134.73511
#12 global_rate_positive_words 131.54903
#13 average_token_length 129.97954
#14 avg_positive_polarity 128.60078
#15 global_rate_negative_words 128.23493
#16 avg_negative_polarity 125.90124
#17 kw_min_avg 123.40070
#18 num_hrefs 116.62004
#19 title_sentiment_polarity 87.11326
#20 n_tokens_title 83.84260
#21 num_imgs 79.16993
#22 kw_max_max 39.92993
#23 weekday_is_saturday 31.44492
#24 data_channel_is_entertainment 23.36260
#25 data_channel_is_socmed 19.59069
#26 weekday_is_sunday 18.43724
#The top predictors in terms of Mean Decrease in Gini include maximum shares of the average keyword, closeness to LDA
#topic 2, minimum shares of referenced articles in Mashable, rate of unique non-stop words in the content, closeness to
#LDA topic 4, closeness to LDA topic 0, maximum shares of the worst keyword, average shares of the best keyword,
#closeness to LDA topic 1, text subjectivity, and so on.
#Plot Important Variables
#randomForest's built-in dotcharts: type = 1 is permutation (accuracy) importance, type = 2 is Gini importance
varImpPlot(rf_model, type=1, main = "Variable Importance (Mean Decrease in Accuracy)")
varImpPlot(rf_model, type=2, main = "Variable Importance (Mean Decrease in Gini)")
#With no type argument both measures are drawn side by side
varImpPlot(rf_model, main = "Variable Importance")
#install.packages("vip")
library(vip)
## Warning: package 'vip' was built under R version 3.6.2
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
#Variable Importance (Mean Decrease in Accuracy)
#num_features = 26L shows all 26 predictors instead of vip's default top 10
vip(rf_model, type=1, num_features = 26L)
#Variable Importance (Mean Decrease in Gini)
vip(rf_model, type=2, num_features = 26L)
#Using 10-fold cross-validation
#Create a random forest model with parameter mtry = 2
#mtry is the number of variables randomly sampled as split candidates at each node;
#importance = TRUE stores the importance measures so importance()/varImpPlot() work later
set.seed(123)  #fix the random seed so the bootstrap samples are reproducible
rf_model2 <- randomForest(shares_cat ~., data = train, mtry = 2, importance = TRUE)
#Print the model (shows OOB error estimate and confusion matrix)
print(rf_model2)
##
## Call:
## randomForest(formula = shares_cat ~ ., data = train, mtry = 2, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 33.84%
## Confusion matrix:
## 0 1 class.error
## 0 12210 5863 0.3244066
## 1 6209 11397 0.3526639
#Number of trees: 500
#No. of variables tried at each split: 2
#OOB estimate of error rate: 33.84%
#Confusion matrix:
# 0 1 class.error
#0 12210 5863 0.3244066
#1 6209 11397 0.3526639
#Plot the results
#Error rate (OOB and per-class) as a function of the number of trees grown
plot(rf_model2)
#The errors from the model decrease until around 200 trees and then flatten.
#Test the model on the held-out test set
rf_predict2 <- predict(rf_model2, test)
#Confusion Matrix and Statistics
#positive = "1" treats popular articles as the positive class;
#mode = "everything" reports precision, recall and F1 in addition to the defaults
rf_confusionMatrix_stats2 <- confusionMatrix(rf_predict2, test$shares_cat, mode = "everything", positive = "1")
print(rf_confusionMatrix_stats2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1394 712
## 1 615 1244
##
## Accuracy : 0.6653
## 95% CI : (0.6504, 0.68)
## No Information Rate : 0.5067
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3301
##
## Mcnemar's Test P-Value : 0.008405
##
## Sensitivity : 0.6360
## Specificity : 0.6939
## Pos Pred Value : 0.6692
## Neg Pred Value : 0.6619
## Precision : 0.6692
## Recall : 0.6360
## F1 : 0.6522
## Prevalence : 0.4933
## Detection Rate : 0.3137
## Detection Prevalence : 0.4689
## Balanced Accuracy : 0.6649
##
## 'Positive' Class : 1
##
#Confusion Matrix and Statistics
# Reference
#Prediction 0 1
# 0 1394 712
# 1 615 1244
#Accuracy : 0.6653
#95% CI : (0.6504, 0.68)
#No Information Rate : 0.5067
#P-Value [Acc > NIR] : < 2.2e-16
#Kappa : 0.3301
#Mcnemar's Test P-Value : 0.008405
# Sensitivity : 0.6360
# Specificity : 0.6939
# Pos Pred Value : 0.6692
# Neg Pred Value : 0.6619
# Precision : 0.6692
# Recall : 0.6360
# F1 : 0.6522
# Prevalence : 0.4933
# Detection Rate : 0.3137
# Detection Prevalence : 0.4689
# Balanced Accuracy : 0.6649
#AUC Score
#Convert the factor predictions back to numeric 0/1 once, instead of repeating the
#as.numeric(levels(...))[as.integer(...)] expression in every call below
rf_scores2 <- as.numeric(levels(rf_predict2))[as.integer(rf_predict2)]
rfauc2 <- roc(test$shares_cat, rf_scores2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(rfauc2)
##
## Call:
## roc.default(response = test$shares_cat, predictor = rf_scores2)
##
## Data: rf_scores2 in 2009 controls (test$shares_cat 0) < 1956 cases (test$shares_cat 1).
## Area under the curve: 0.6649
#Area under the curve: 0.6649
#Plot AUC
plot(rfauc2)
#Plot ROC curve (ROCR prediction/performance pipeline)
rfroc_pred2 <- prediction(rf_scores2, test$shares_cat)
rf_performance2 <- performance(rfroc_pred2, "tpr", "fpr")
plot(rf_performance2, colorize=TRUE)
#Check the important variables (both accuracy- and Gini-based measures)
importance(rf_model2)
## 0 1 MeanDecreaseAccuracy
## kw_max_avg 29.371215 34.970232 51.190404
## self_reference_min_shares 33.307007 34.013098 46.813368
## LDA_00 22.184025 21.376196 35.448471
## kw_min_avg 38.957265 1.066518 39.472782
## LDA_02 37.135689 6.597171 43.750978
## weekday_is_saturday 38.198417 20.948111 37.403561
## LDA_04 19.936763 20.492122 36.847525
## data_channel_is_socmed 19.662243 32.743766 35.476738
## data_channel_is_entertainment 29.736828 10.271871 34.176360
## LDA_01 24.424007 11.732300 32.296765
## LDA_03 21.912461 7.127194 29.180045
## n_non_stop_unique_tokens 26.616667 6.499903 27.847808
## num_hrefs 18.582652 13.607931 26.458185
## kw_max_max 17.516106 15.750448 26.899394
## num_imgs 29.580147 5.876901 30.205263
## kw_avg_max 27.298916 2.245783 29.179395
## weekday_is_sunday 24.716104 15.189719 25.542122
## average_token_length 23.556573 1.884194 22.760132
## global_rate_positive_words 11.139501 14.678262 21.486219
## global_subjectivity 10.654194 18.558555 23.593209
## kw_max_min 13.705562 11.878612 18.255461
## avg_positive_polarity 3.789470 12.216383 13.816956
## global_rate_negative_words 10.396472 8.772201 15.219374
## avg_negative_polarity 7.652149 8.834470 14.198962
## n_tokens_title 3.952239 4.255027 5.866347
## title_sentiment_polarity 10.220610 3.092586 9.694414
## MeanDecreaseGini
## kw_max_avg 1048.0162
## self_reference_min_shares 917.2052
## LDA_00 806.8836
## kw_min_avg 710.9007
## LDA_02 930.2219
## weekday_is_saturday 148.8817
## LDA_04 833.7366
## data_channel_is_socmed 130.1875
## data_channel_is_entertainment 169.3873
## LDA_01 812.4145
## LDA_03 778.7597
## n_non_stop_unique_tokens 805.7213
## num_hrefs 644.3131
## kw_max_max 237.4172
## num_imgs 451.3926
## kw_avg_max 806.7945
## weekday_is_sunday 114.3049
## average_token_length 763.7803
## global_rate_positive_words 775.4221
## global_subjectivity 798.0035
## kw_max_min 764.1800
## avg_positive_polarity 742.6808
## global_rate_negative_words 734.1997
## avg_negative_polarity 715.4689
## n_tokens_title 480.0319
## title_sentiment_polarity 505.5709
# 0 1 MeanDecreaseAccuracy MeanDecreaseGini
#kw_max_avg 29.371215 34.970232 51.190404 1048.0162
#self_reference_min_shares 33.307007 34.013098 46.813368 917.2052
#LDA_00 22.184025 21.376196 35.448471 806.8836
#kw_min_avg 38.957265 1.066518 39.472782 710.9007
#LDA_02 37.135689 6.597171 43.750978 930.2219
#weekday_is_saturday 38.198417 20.948111 37.403561 148.8817
#LDA_04 19.936763 20.492122 36.847525 833.7366
#data_channel_is_socmed 19.662243 32.743766 35.476738 130.1875
#data_channel_is_entertainment 29.736828 10.271871 34.176360 169.3873
#LDA_01 24.424007 11.732300 32.296765 812.4145
#LDA_03 21.912461 7.127194 29.180045 778.7597
#n_non_stop_unique_tokens 26.616667 6.499903 27.847808 805.7213
#num_hrefs 18.582652 13.607931 26.458185 644.3131
#kw_max_max 17.516106 15.750448 26.899394 237.4172
#num_imgs 29.580147 5.876901 30.205263 451.3926
#kw_avg_max 27.298916 2.245783 29.179395 806.7945
#weekday_is_sunday 24.716104 15.189719 25.542122 114.3049
#average_token_length 23.556573 1.884194 22.760132 763.7803
#global_rate_positive_words 11.139501 14.678262 21.486219 775.4221
#global_subjectivity 10.654194 18.558555 23.593209 798.0035
#kw_max_min 13.705562 11.878612 18.255461 764.1800
#avg_positive_polarity 3.789470 12.216383 13.816956 742.6808
#global_rate_negative_words 10.396472 8.772201 15.219374 734.1997
#avg_negative_polarity 7.652149 8.834470 14.198962 715.4689
#n_tokens_title 3.952239 4.255027 5.866347 480.0319
#title_sentiment_polarity 10.220610 3.092586 9.694414 505.5709
#Evaluate variable importance according to mean decrease in accuracy and mean decrease in gini
#Mean Decrease in Accuracy (importance type = 1: permutation-based accuracy loss)
rf_imp_class2 <- importance(rf_model2, type = 1)
rf_imp_class2 <- data.frame(predictors = rownames(rf_imp_class2), rf_imp_class2)
#Mean Decrease Gini (importance type = 2: total decrease in node impurity)
rf_imp_gini2 <- importance(rf_model2, type = 2)
rf_imp_gini2 <- data.frame(predictors = rownames(rf_imp_gini2), rf_imp_gini2)
#Order the predictors by importance according to Mean Decrease in Accuracy
rf_imp_class2.sort <- arrange(rf_imp_class2, desc(MeanDecreaseAccuracy))
#Lock the factor levels to the sorted order so downstream plots keep this ordering
rf_imp_class2.sort$predictors <- factor(rf_imp_class2.sort$predictors, levels = rf_imp_class2.sort$predictors)
#Print the predictors sorted by Mean Decrease in Accuracy
print(rf_imp_class2.sort)
## predictors MeanDecreaseAccuracy
## 1 kw_max_avg 51.190404
## 2 self_reference_min_shares 46.813368
## 3 LDA_02 43.750978
## 4 kw_min_avg 39.472782
## 5 weekday_is_saturday 37.403561
## 6 LDA_04 36.847525
## 7 data_channel_is_socmed 35.476738
## 8 LDA_00 35.448471
## 9 data_channel_is_entertainment 34.176360
## 10 LDA_01 32.296765
## 11 num_imgs 30.205263
## 12 LDA_03 29.180045
## 13 kw_avg_max 29.179395
## 14 n_non_stop_unique_tokens 27.847808
## 15 kw_max_max 26.899394
## 16 num_hrefs 26.458185
## 17 weekday_is_sunday 25.542122
## 18 global_subjectivity 23.593209
## 19 average_token_length 22.760132
## 20 global_rate_positive_words 21.486219
## 21 kw_max_min 18.255461
## 22 global_rate_negative_words 15.219374
## 23 avg_negative_polarity 14.198962
## 24 avg_positive_polarity 13.816956
## 25 title_sentiment_polarity 9.694414
## 26 n_tokens_title 5.866347
# predictors MeanDecreaseAccuracy
#1 kw_max_avg 51.190404
#2 self_reference_min_shares 46.813368
#3 LDA_02 43.750978
#4 kw_min_avg 39.472782
#5 weekday_is_saturday 37.403561
#6 LDA_04 36.847525
#7 data_channel_is_socmed 35.476738
#8 LDA_00 35.448471
#9 data_channel_is_entertainment 34.176360
#10 LDA_01 32.296765
#11 num_imgs 30.205263
#12 LDA_03 29.180045
#13 kw_avg_max 29.179395
#14 n_non_stop_unique_tokens 27.847808
#15 kw_max_max 26.899394
#16 num_hrefs 26.458185
#17 weekday_is_sunday 25.542122
#18 global_subjectivity 23.593209
#19 average_token_length 22.760132
#20 global_rate_positive_words 21.486219
#21 kw_max_min 18.255461
#22 global_rate_negative_words 15.219374
#23 avg_negative_polarity 14.198962
#24 avg_positive_polarity 13.816956
#25 title_sentiment_polarity 9.694414
#26 n_tokens_title 5.866347
#The top predictors in terms of Mean Decrease in Accuracy include maximum shares of the average keyword, minimum shares
#of referenced articles in Mashable, closeness to LDA topic 2, minimum shares of the average keyword, whether the article
#was posted on Saturday, closeness to LDA topic 4, whether the data channel is social media, closeness to LDA topic 0,
#whether the data channel is entertainment, closeness to LDA topic 1, and so on.
#Order the predictors by importance according to Mean Decrease in Gini
#Use the bare column name inside arrange() (data-masking) rather than rf_imp_gini2$MeanDecreaseGini
rf_imp_gini2.sort <- arrange(rf_imp_gini2, desc(MeanDecreaseGini))
#Lock the factor levels to the sorted order so downstream plots keep this ordering
rf_imp_gini2.sort$predictors <- factor(rf_imp_gini2.sort$predictors, levels = rf_imp_gini2.sort$predictors)
#Print the predictors sorted by Mean Decrease in Gini
print(rf_imp_gini2.sort)
## predictors MeanDecreaseGini
## 1 kw_max_avg 1048.0162
## 2 LDA_02 930.2219
## 3 self_reference_min_shares 917.2052
## 4 LDA_04 833.7366
## 5 LDA_01 812.4145
## 6 LDA_00 806.8836
## 7 kw_avg_max 806.7945
## 8 n_non_stop_unique_tokens 805.7213
## 9 global_subjectivity 798.0035
## 10 LDA_03 778.7597
## 11 global_rate_positive_words 775.4221
## 12 kw_max_min 764.1800
## 13 average_token_length 763.7803
## 14 avg_positive_polarity 742.6808
## 15 global_rate_negative_words 734.1997
## 16 avg_negative_polarity 715.4689
## 17 kw_min_avg 710.9007
## 18 num_hrefs 644.3131
## 19 title_sentiment_polarity 505.5709
## 20 n_tokens_title 480.0319
## 21 num_imgs 451.3926
## 22 kw_max_max 237.4172
## 23 data_channel_is_entertainment 169.3873
## 24 weekday_is_saturday 148.8817
## 25 data_channel_is_socmed 130.1875
## 26 weekday_is_sunday 114.3049
# predictors MeanDecreaseGini
#1 kw_max_avg 1048.0162
#2 LDA_02 930.2219
#3 self_reference_min_shares 917.2052
#4 LDA_04 833.7366
#5 LDA_01 812.4145
#6 LDA_00 806.8836
#7 kw_avg_max 806.7945
#8 n_non_stop_unique_tokens 805.7213
#9 global_subjectivity 798.0035
#10 LDA_03 778.7597
#11 global_rate_positive_words 775.4221
#12 kw_max_min 764.1800
#13 average_token_length 763.7803
#14 avg_positive_polarity 742.6808
#15 global_rate_negative_words 734.1997
#16 avg_negative_polarity 715.4689
#17 kw_min_avg 710.9007
#18 num_hrefs 644.3131
#19 title_sentiment_polarity 505.5709
#20 n_tokens_title 480.0319
#21 num_imgs 451.3926
#22 kw_max_max 237.4172
#23 data_channel_is_entertainment 169.3873
#24 weekday_is_saturday 148.8817
#25 data_channel_is_socmed 130.1875
#26 weekday_is_sunday 114.3049
#The top predictors in terms of Mean Decrease in Gini include maximum shares of the average keyword, closeness to LDA
#topic 2, minimum shares of referenced articles in Mashable, closeness to LDA topic 4, closeness to LDA topic 1,
#closeness to LDA topic 0, average shares of the best keyword, rate of unique non-stop words in the content, text
#subjectivity, closeness to LDA topic 3, and so on.
#Comparing important variables found using mean decrease in accuracy with mean decrease in gini suggests the most #important variables are maximum shares of the average keyword, minimum shares of referenced articles in Mashable, #closeness to LDA topic 2, closeness to LDA topic 4, closeness to LDA topic 0, and closeness to LDA topic 1.
#Plot Important Variables
#Dotcharts of the two importance measures for the second random forest (mtry = 2)
varImpPlot(rf_model2, type=1, main = "Variable Importance (Mean Decrease in Accuracy)")
varImpPlot(rf_model2, type=2, main = "Variable Importance (Mean Decrease in Gini)")
#With no type argument both measures are drawn side by side
varImpPlot(rf_model2, main = "Variable Importance")
#Variable Importance (Mean Decrease in Accuracy)
vip(rf_model2, type=1, num_features = 26L)
#Variable Importance (Mean Decrease in Gini)
vip(rf_model2, type=2, num_features = 26L)
#Gradient Boosting Machine
#install.packages("gbm")
library(gbm)
## Loaded gbm 2.1.5
#Train the model with different tuning parameters
#tuneLength = 5 lets caret build a grid with 5 levels per tunable parameter;
#trainControl(method = "cv") uses cross-validation (the printed output shows 10 folds)
set.seed(123)  #fix the random seed so the CV folds are reproducible
gbmfit <- train(shares_cat ~., data = train.set, method = "gbm", tuneLength = 5,
trControl = trainControl(method = "cv", search = "grid"), verbose = FALSE, distribution = "bernoulli")
#Print the training model to see the accuracy
print(gbmfit)
## Stochastic Gradient Boosting
##
## 27750 samples
## 26 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 24974, 24975, 24975, 24975, 24975, 24975, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.6430993 0.2857176
## 1 100 0.6496583 0.2988222
## 1 150 0.6515318 0.3026774
## 1 200 0.6538017 0.3072533
## 1 250 0.6560001 0.3116892
## 2 50 0.6504868 0.3006533
## 2 100 0.6546308 0.3089990
## 2 150 0.6580904 0.3159702
## 2 200 0.6587030 0.3171902
## 2 250 0.6596037 0.3189818
## 3 50 0.6538740 0.3075327
## 3 100 0.6590271 0.3178898
## 3 150 0.6593154 0.3184361
## 3 200 0.6603244 0.3204782
## 3 250 0.6593874 0.3185842
## 4 50 0.6561444 0.3120997
## 4 100 0.6608653 0.3215290
## 4 150 0.6602887 0.3203715
## 4 200 0.6613697 0.3225514
## 4 250 0.6617299 0.3232823
## 5 50 0.6575858 0.3150339
## 5 100 0.6610454 0.3219121
## 5 150 0.6620181 0.3238679
## 5 200 0.6618379 0.3234836
## 5 250 0.6618741 0.3235657
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 5, shrinkage = 0.1 and n.minobsinnode = 10.
#interaction.depth n.trees Accuracy Kappa
#1 50 0.6430993 0.2857176
#1 100 0.6496583 0.2988222
#1 150 0.6515318 0.3026774
#1 200 0.6538017 0.3072533
#1 250 0.6560001 0.3116892
#2 50 0.6504868 0.3006533
#2 100 0.6546308 0.3089990
#2 150 0.6580904 0.3159702
#2 200 0.6587030 0.3171902
#2 250 0.6596037 0.3189818
#3 50 0.6538740 0.3075327
#3 100 0.6590271 0.3178898
#3 150 0.6593154 0.3184361
#3 200 0.6603244 0.3204782
#3 250 0.6593874 0.3185842
#4 50 0.6561444 0.3120997
#4 100 0.6608653 0.3215290
#4 150 0.6602887 0.3203715
#4 200 0.6613697 0.3225514
#4 250 0.6617299 0.3232823
#5 50 0.6575858 0.3150339
#5 100 0.6610454 0.3219121
#5 150 0.6620181 0.3238679
#5 200 0.6618379 0.3234836
#5 250 0.6618741 0.3235657
#Tuning parameter 'shrinkage' was held constant at a value of 0.1
#Tuning parameter 'n.minobsinnode' was held constant at
#a value of 10
#Accuracy was used to select the optimal model using the largest value.
#The final values used for the model were n.trees = 150, interaction.depth = 5, shrinkage = 0.1 and n.minobsinnode = 10.
#Plot the results
#Resampled accuracy vs. number of boosting iterations, one curve per interaction depth
plot(gbmfit)
#Important variables that have the largest influence on popularity
#(relative influence of each predictor, printed and plotted)
summary(gbmfit)
## var rel.inf
## kw_max_avg kw_max_avg 12.6571144
## self_reference_min_shares self_reference_min_shares 11.8560679
## kw_min_avg kw_min_avg 7.5172750
## LDA_02 LDA_02 7.3604733
## data_channel_is_entertainment1 data_channel_is_entertainment1 6.3975418
## weekday_is_saturday1 weekday_is_saturday1 5.2460129
## LDA_04 LDA_04 4.6223684
## n_non_stop_unique_tokens n_non_stop_unique_tokens 4.5386342
## kw_max_max kw_max_max 3.6010261
## weekday_is_sunday1 weekday_is_sunday1 3.5225634
## data_channel_is_socmed1 data_channel_is_socmed1 3.1720619
## num_hrefs num_hrefs 3.0009931
## LDA_01 LDA_01 2.8673100
## LDA_00 LDA_00 2.8246000
## kw_avg_max kw_avg_max 2.8200655
## global_subjectivity global_subjectivity 2.5722805
## kw_max_min kw_max_min 2.4256271
## global_rate_positive_words global_rate_positive_words 2.2880355
## num_imgs num_imgs 2.1627667
## LDA_03 LDA_03 2.1558032
## title_sentiment_polarity title_sentiment_polarity 1.7554967
## avg_positive_polarity avg_positive_polarity 1.1621387
## global_rate_negative_words global_rate_negative_words 1.1050324
## avg_negative_polarity avg_negative_polarity 1.0281309
## average_token_length average_token_length 0.9962160
## n_tokens_title n_tokens_title 0.3443644
# var rel.inf
#kw_max_avg kw_max_avg 12.6571144
#self_reference_min_shares self_reference_min_shares 11.8560679
#kw_min_avg kw_min_avg 7.5172750
#LDA_02 LDA_02 7.3604733
#data_channel_is_entertainment1 data_channel_is_entertainment1 6.3975418
#weekday_is_saturday1 weekday_is_saturday1 5.2460129
#LDA_04 LDA_04 4.6223684
#n_non_stop_unique_tokens n_non_stop_unique_tokens 4.5386342
#kw_max_max kw_max_max 3.6010261
#weekday_is_sunday1 weekday_is_sunday1 3.5225634
#data_channel_is_socmed1 data_channel_is_socmed1 3.1720619
#num_hrefs num_hrefs 3.0009931
#LDA_01 LDA_01 2.8673100
#LDA_00 LDA_00 2.8246000
#kw_avg_max kw_avg_max 2.8200655
#global_subjectivity global_subjectivity 2.5722805
#kw_max_min kw_max_min 2.4256271
#global_rate_positive_words global_rate_positive_words 2.2880355
#num_imgs num_imgs 2.1627667
#LDA_03 LDA_03 2.1558032
#title_sentiment_polarity title_sentiment_polarity 1.7554967
#avg_positive_polarity avg_positive_polarity 1.1621387
#global_rate_negative_words global_rate_negative_words 1.1050324
#avg_negative_polarity avg_negative_polarity 1.0281309
#average_token_length average_token_length 0.9962160
#n_tokens_title n_tokens_title 0.3443644
#The most important variables found are maximum shares of the average keyword, minimum shares of referenced articles in
#Mashable, minimum shares of the average keyword, closeness to LDA topic 2, whether the data channel is entertainment,
#whether the article was published on Saturday, closeness to LDA topic 4, rate of unique non-stop words in the content,
#maximum shares of the best keyword, whether the article was published on Sunday, and so on.
#Variable importance plot for the tuned GBM, showing all 26 predictors
vip(gbmfit, num_features = 26L)
#Since the response only has 2 unique values (0,1), a bernoulli distribution should be specified.
#Change dependent variable to numeric instead of factor to get (0,1) observations in order to use the bernoulli
#distribution in the gbm() function.
#Helper: convert a factor whose levels are numbers back to those numeric values
#(e.g. factor level "1" -> numeric 1); the standard idiom as.numeric(levels(f))[as.integer(f)]
factor_to_numeric <- function(f) as.numeric(levels(f))[as.integer(f)]
#Helper: apply the factor-to-numeric conversion to the named columns of a data frame
convert_factor_cols <- function(df, cols) {
  df[cols] <- lapply(df[cols], factor_to_numeric)
  df
}
#The binary columns that were stored as factors and must become numeric 0/1 for gbm()
binary_cols <- c("shares_cat", "weekday_is_saturday", "weekday_is_sunday",
                 "data_channel_is_socmed", "data_channel_is_entertainment")
#Create numeric copies of each dataset, leaving the originals untouched
train2 <- convert_factor_cols(train, binary_cols)
test2 <- convert_factor_cols(test, binary_cols)
train.set2 <- convert_factor_cols(train.set, binary_cols)
test.set2 <- convert_factor_cols(test.set, binary_cols)
valid.set2 <- convert_factor_cols(valid.set, binary_cols)
#Create a GBM model with the validation set using the best parameters
#The final values found for the model were n.trees = 150, interaction.depth = 5, shrinkage = 0.1, and n.minobsinnode = 10
#where n.trees is the number of trees,
#interaction.depth is the maximum depth of each tree,
#shrinkage is the learning rate applied to each tree in the expansion,
#and n.minobsinnode is the minimum number of observations in the terminal nodes of the trees.
set.seed(123)  #fix the random seed so the fit is reproducible
#distribution = "bernoulli" requires the numeric 0/1 response prepared above
gbm_model <- gbm(shares_cat ~., data = valid.set2, n.trees = 150, interaction.depth = 5, shrinkage = 0.1,
n.minobsinnode = 10, distribution = "bernoulli")
#Print the model
print(gbm_model)
## gbm(formula = shares_cat ~ ., distribution = "bernoulli", data = valid.set2,
## n.trees = 150, interaction.depth = 5, n.minobsinnode = 10,
## shrinkage = 0.1)
## A gradient boosted model with bernoulli loss function.
## 150 iterations were performed.
## There were 26 predictors of which 26 had non-zero influence.
#A gradient boosted model with bernoulli loss function.
#150 iterations were performed.
#There were 26 predictors of which 26 had non-zero influence.
#Plot the results (marginal plot of fitted GBM objects)
#plot.gbm defaults to the first predictor (i.var = 1), here the maximum shares of the average keyword
plot(gbm_model)
#The marginal plot shows the marginal effect between the maximum shares of the average keyword and the dependent
#variable, popularity of an article.
#This plot suggests there is a positive effect at the bottom 20% of the distribution of maximum shares of the average
#keyword.
#Important variables (relative influence of each predictor)
summary(gbm_model)
## var rel.inf
## kw_max_avg kw_max_avg 10.3582377
## self_reference_min_shares self_reference_min_shares 8.6723327
## n_non_stop_unique_tokens n_non_stop_unique_tokens 7.3430337
## LDA_04 LDA_04 6.0129047
## kw_min_avg kw_min_avg 5.5262958
## LDA_02 LDA_02 5.1670884
## LDA_00 LDA_00 4.4950009
## kw_max_min kw_max_min 4.4593690
## LDA_01 LDA_01 4.0357731
## weekday_is_saturday weekday_is_saturday 3.9389358
## global_rate_negative_words global_rate_negative_words 3.6805618
## average_token_length average_token_length 3.5220553
## global_subjectivity global_subjectivity 3.5032973
## kw_avg_max kw_avg_max 3.4919890
## num_hrefs num_hrefs 3.1470426
## LDA_03 LDA_03 3.1283730
## avg_positive_polarity avg_positive_polarity 2.9307724
## avg_negative_polarity avg_negative_polarity 2.6840701
## title_sentiment_polarity title_sentiment_polarity 2.5734181
## kw_max_max kw_max_max 2.2448493
## global_rate_positive_words global_rate_positive_words 2.2418899
## weekday_is_sunday weekday_is_sunday 2.0075215
## num_imgs num_imgs 1.5013011
## data_channel_is_socmed data_channel_is_socmed 1.2729848
## data_channel_is_entertainment data_channel_is_entertainment 1.2144479
## n_tokens_title n_tokens_title 0.8464541
# var rel.inf
#kw_max_avg kw_max_avg 10.3582377
#self_reference_min_shares self_reference_min_shares 8.6723327
#n_non_stop_unique_tokens n_non_stop_unique_tokens 7.3430337
#LDA_04 LDA_04 6.0129047
#kw_min_avg kw_min_avg 5.5262958
#LDA_02 LDA_02 5.1670884
#LDA_00 LDA_00 4.4950009
#kw_max_min kw_max_min 4.4593690
#LDA_01 LDA_01 4.0357731
#weekday_is_saturday weekday_is_saturday 3.9389358
#global_rate_negative_words global_rate_negative_words 3.6805618
#average_token_length average_token_length 3.5220553
#global_subjectivity global_subjectivity 3.5032973
#kw_avg_max kw_avg_max 3.4919890
#num_hrefs num_hrefs 3.1470426
#LDA_03 LDA_03 3.1283730
#avg_positive_polarity avg_positive_polarity 2.9307724
#avg_negative_polarity avg_negative_polarity 2.6840701
#title_sentiment_polarity title_sentiment_polarity 2.5734181
#kw_max_max kw_max_max 2.2448493
#global_rate_positive_words global_rate_positive_words 2.2418899
#weekday_is_sunday weekday_is_sunday 2.0075215
#num_imgs num_imgs 1.5013011
#data_channel_is_socmed data_channel_is_socmed 1.2729848
#data_channel_is_entertainment data_channel_is_entertainment 1.2144479
#n_tokens_title n_tokens_title 0.8464541
#The most important variables found are maximum shares of the average keyword, minimum shares of referenced articles in
#Mashable, rate of unique non-stop words in the content, closeness to LDA topic 4, minimum shares of the average keyword,
#closeness to LDA topic 2, closeness to LDA topic 0, maximum shares of the worst keyword, closeness to LDA topic 1,
#whether the article was published on Saturday, and so on.
#Variable importance plot for all 26 predictors
vip(gbm_model, num_features = 26L)
#Make predictions using the test data; type = "response" returns predicted
#probabilities under the bernoulli (logistic) loss
gbm_predict <- predict(object = gbm_model, newdata = test.set2, n.trees = 150, type = "response")
#Create binary observations from the predictions
#Use the ifelse function to predict if an article is popular with a percentage/probability greater than 50%
#Fix both factor levels explicitly so the factor always carries levels 0 and 1,
#even if every prediction falls on one side of the cutoff — otherwise
#confusionMatrix() errors on a level mismatch between prediction and reference
gbm_predict_binary <- factor(ifelse(gbm_predict > 0.5, 1, 0), levels = c(0, 1))
test.set2$shares_cat <- factor(test.set2$shares_cat, levels = c(0, 1))
#Confusion Matrix and Statistics
gbm_confusionMatrix_stats <- confusionMatrix(gbm_predict_binary, test.set2$shares_cat, positive = "1",
                                             mode = "everything")
print(gbm_confusionMatrix_stats)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1968 1119
## 1 1021 1839
##
## Accuracy : 0.6402
## 95% CI : (0.6278, 0.6524)
## No Information Rate : 0.5026
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.2802
##
## Mcnemar's Test P-Value : 0.03601
##
## Sensitivity : 0.6217
## Specificity : 0.6584
## Pos Pred Value : 0.6430
## Neg Pred Value : 0.6375
## Precision : 0.6430
## Recall : 0.6217
## F1 : 0.6322
## Prevalence : 0.4974
## Detection Rate : 0.3092
## Detection Prevalence : 0.4809
## Balanced Accuracy : 0.6401
##
## 'Positive' Class : 1
##
#Confusion Matrix and Statistics
# Reference
#Prediction 0 1
# 0 1968 1119
# 1 1021 1839
#Accuracy : 0.6402
#95% CI : (0.6278, 0.6524)
#No Information Rate : 0.5026
#P-Value [Acc > NIR] : < 2e-16
#Kappa : 0.2802
#Mcnemar's Test P-Value : 0.03601
# Sensitivity : 0.6217
# Specificity : 0.6584
# Pos Pred Value : 0.6430
# Neg Pred Value : 0.6375
# Precision : 0.6430
# Recall : 0.6217
# F1 : 0.6322
# Prevalence : 0.4974
# Detection Rate : 0.3092
# Detection Prevalence : 0.4809
# Balanced Accuracy : 0.6401
#AUC Score
#roc() from pROC builds the ROC curve and reports the area under the curve
gbmauc <- roc(response = test.set2$shares_cat, predictor = gbm_predict)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(gbmauc)
##
## Call:
## roc.default(response = test.set2$shares_cat, predictor = gbm_predict)
##
## Data: gbm_predict in 2989 controls (test.set2$shares_cat 0) < 2958 cases (test.set2$shares_cat 1).
## Area under the curve: 0.6937
#Area under the curve: 0.6937
#Plot AUC
plot(x = gbmauc)
#Plot ROC curve using ROCR: build a prediction object, then compute the
#true-positive rate against the false-positive rate
gbmroc_pred <- prediction(predictions = gbm_predict, labels = test.set2$shares_cat)
gbm_performance <- performance(gbmroc_pred, measure = "tpr", x.measure = "fpr")
plot(gbm_performance, colorize = TRUE)
#Using 10-fold cross-validation
#Define the GBM model and use the training data to fit the model
#(same hyperparameters as the first GBM: 150 trees, depth 5, learning rate 0.1)
set.seed(123)
gbm_model2 <- gbm(
  shares_cat ~ .,
  distribution = "bernoulli",
  data = train2,
  n.trees = 150,
  interaction.depth = 5,
  shrinkage = 0.1,
  n.minobsinnode = 10
)
#Print the model
print(gbm_model2)
## gbm(formula = shares_cat ~ ., distribution = "bernoulli", data = train2,
## n.trees = 150, interaction.depth = 5, n.minobsinnode = 10,
## shrinkage = 0.1)
## A gradient boosted model with bernoulli loss function.
## 150 iterations were performed.
## There were 26 predictors of which 26 had non-zero influence.
#A gradient boosted model with bernoulli loss function.
#150 iterations were performed.
#There were 26 predictors of which 26 had non-zero influence.
#Plot the results (marginal plot of fitted GBM objects)
#By default, plot.gbm shows the partial dependence on the first predictor
plot(x = gbm_model2)
#The marginal plot shows the marginal effect between the maximum shares of the average keyword and the dependent
#variable, popularity of an article.
#This plot suggests there is a positive effect at the bottom 20% of the distribution of maximum shares of the average
#keyword.
#Important variables
#summary.gbm returns (and prints) the relative influence of each predictor
summary(object = gbm_model2)
## var rel.inf
## kw_max_avg kw_max_avg 14.2237351
## self_reference_min_shares self_reference_min_shares 12.8510400
## kw_min_avg kw_min_avg 7.5303339
## LDA_02 LDA_02 6.7311093
## data_channel_is_entertainment data_channel_is_entertainment 5.3914263
## weekday_is_saturday weekday_is_saturday 5.3243059
## LDA_04 LDA_04 5.1569159
## n_non_stop_unique_tokens n_non_stop_unique_tokens 4.7221051
## kw_max_max kw_max_max 3.6840664
## weekday_is_sunday weekday_is_sunday 3.4819010
## data_channel_is_socmed data_channel_is_socmed 3.2867379
## num_hrefs num_hrefs 2.9878556
## LDA_00 LDA_00 2.7808518
## num_imgs num_imgs 2.4352571
## global_subjectivity global_subjectivity 2.3983491
## kw_avg_max kw_avg_max 2.3016619
## LDA_01 LDA_01 2.2997932
## kw_max_min kw_max_min 2.2293580
## LDA_03 LDA_03 1.7798341
## global_rate_negative_words global_rate_negative_words 1.7362134
## title_sentiment_polarity title_sentiment_polarity 1.4552712
## global_rate_positive_words global_rate_positive_words 1.4354622
## average_token_length average_token_length 1.3087644
## avg_negative_polarity avg_negative_polarity 1.0602890
## avg_positive_polarity avg_positive_polarity 1.0575778
## n_tokens_title n_tokens_title 0.3497847
# var rel.inf
#kw_max_avg kw_max_avg 14.2237351
#self_reference_min_shares self_reference_min_shares 12.8510400
#kw_min_avg kw_min_avg 7.5303339
#LDA_02 LDA_02 6.7311093
#data_channel_is_entertainment data_channel_is_entertainment 5.3914263
#weekday_is_saturday weekday_is_saturday 5.3243059
#LDA_04 LDA_04 5.1569159
#n_non_stop_unique_tokens n_non_stop_unique_tokens 4.7221051
#kw_max_max kw_max_max 3.6840664
#weekday_is_sunday weekday_is_sunday 3.4819010
#data_channel_is_socmed data_channel_is_socmed 3.2867379
#num_hrefs num_hrefs 2.9878556
#LDA_00 LDA_00 2.7808518
#num_imgs num_imgs 2.4352571
#global_subjectivity global_subjectivity 2.3983491
#kw_avg_max kw_avg_max 2.3016619
#LDA_01 LDA_01 2.2997932
#kw_max_min kw_max_min 2.2293580
#LDA_03 LDA_03 1.7798341
#global_rate_negative_words global_rate_negative_words 1.7362134
#title_sentiment_polarity title_sentiment_polarity 1.4552712
#global_rate_positive_words global_rate_positive_words 1.4354622
#average_token_length average_token_length 1.3087644
#avg_negative_polarity avg_negative_polarity 1.0602890
#avg_positive_polarity avg_positive_polarity 1.0575778
#n_tokens_title n_tokens_title 0.3497847
#The most important variables found are maximum shares of the average keyword, minimum shares of referenced articles in
#Mashable, minimum shares of the average keyword, closeness to LDA topic 2, whether the data channel is entertainment,
#whether the article was published on Saturday, closeness to LDA topic 4, rate of unique non-stop words in the content,
#maximum shares of the best keyword, whether the article was published on Sunday, and so on.
vip(gbm_model2, num_features = 26L)
#The important variables obtained based on the Gradient Boosting Machine model may be compared with the ones found in
#the Random Forest model (using both mean decrease in accuracy and mean decrease in Gini).
#Across all models, maximum shares of the average keyword is found to be the most important variable.
#This is followed by minimum shares of referenced articles in Mashable, and closeness to LDA topic 2.
#Recall the important variables in the Random Forest model for comparison:
#type = 1 plots mean decrease in accuracy; type = 2 plots mean decrease in Gini
vip(object = rf_model2, type = 1, num_features = 26L)
vip(object = rf_model2, type = 2, num_features = 26L)
#Importance of predictors based on Random Forest sorted by Mean Decrease in Accuracy
print(rf_imp_class2.sort)
## predictors MeanDecreaseAccuracy
## 1 kw_max_avg 51.190404
## 2 self_reference_min_shares 46.813368
## 3 LDA_02 43.750978
## 4 kw_min_avg 39.472782
## 5 weekday_is_saturday 37.403561
## 6 LDA_04 36.847525
## 7 data_channel_is_socmed 35.476738
## 8 LDA_00 35.448471
## 9 data_channel_is_entertainment 34.176360
## 10 LDA_01 32.296765
## 11 num_imgs 30.205263
## 12 LDA_03 29.180045
## 13 kw_avg_max 29.179395
## 14 n_non_stop_unique_tokens 27.847808
## 15 kw_max_max 26.899394
## 16 num_hrefs 26.458185
## 17 weekday_is_sunday 25.542122
## 18 global_subjectivity 23.593209
## 19 average_token_length 22.760132
## 20 global_rate_positive_words 21.486219
## 21 kw_max_min 18.255461
## 22 global_rate_negative_words 15.219374
## 23 avg_negative_polarity 14.198962
## 24 avg_positive_polarity 13.816956
## 25 title_sentiment_polarity 9.694414
## 26 n_tokens_title 5.866347
# predictors MeanDecreaseAccuracy
#1 kw_max_avg 51.190404
#2 self_reference_min_shares 46.813368
#3 LDA_02 43.750978
#4 kw_min_avg 39.472782
#5 weekday_is_saturday 37.403561
#6 LDA_04 36.847525
#7 data_channel_is_socmed 35.476738
#8 LDA_00 35.448471
#9 data_channel_is_entertainment 34.176360
#10 LDA_01 32.296765
#11 num_imgs 30.205263
#12 LDA_03 29.180045
#13 kw_avg_max 29.179395
#14 n_non_stop_unique_tokens 27.847808
#15 kw_max_max 26.899394
#16 num_hrefs 26.458185
#17 weekday_is_sunday 25.542122
#18 global_subjectivity 23.593209
#19 average_token_length 22.760132
#20 global_rate_positive_words 21.486219
#21 kw_max_min 18.255461
#22 global_rate_negative_words 15.219374
#23 avg_negative_polarity 14.198962
#24 avg_positive_polarity 13.816956
#25 title_sentiment_polarity 9.694414
#26 n_tokens_title 5.866347
#Importance of predictors based on Random Forest sorted by Mean Decrease in Gini
#(rf_imp_gini2.sort was computed earlier in the Random Forest section;
#repeated here for side-by-side comparison with the GBM importances)
print(rf_imp_gini2.sort)
## predictors MeanDecreaseGini
## 1 kw_max_avg 1048.0162
## 2 LDA_02 930.2219
## 3 self_reference_min_shares 917.2052
## 4 LDA_04 833.7366
## 5 LDA_01 812.4145
## 6 LDA_00 806.8836
## 7 kw_avg_max 806.7945
## 8 n_non_stop_unique_tokens 805.7213
## 9 global_subjectivity 798.0035
## 10 LDA_03 778.7597
## 11 global_rate_positive_words 775.4221
## 12 kw_max_min 764.1800
## 13 average_token_length 763.7803
## 14 avg_positive_polarity 742.6808
## 15 global_rate_negative_words 734.1997
## 16 avg_negative_polarity 715.4689
## 17 kw_min_avg 710.9007
## 18 num_hrefs 644.3131
## 19 title_sentiment_polarity 505.5709
## 20 n_tokens_title 480.0319
## 21 num_imgs 451.3926
## 22 kw_max_max 237.4172
## 23 data_channel_is_entertainment 169.3873
## 24 weekday_is_saturday 148.8817
## 25 data_channel_is_socmed 130.1875
## 26 weekday_is_sunday 114.3049
# predictors MeanDecreaseGini
#1 kw_max_avg 1048.0162
#2 LDA_02 930.2219
#3 self_reference_min_shares 917.2052
#4 LDA_04 833.7366
#5 LDA_01 812.4145
#6 LDA_00 806.8836
#7 kw_avg_max 806.7945
#8 n_non_stop_unique_tokens 805.7213
#9 global_subjectivity 798.0035
#10 LDA_03 778.7597
#11 global_rate_positive_words 775.4221
#12 kw_max_min 764.1800
#13 average_token_length 763.7803
#14 avg_positive_polarity 742.6808
#15 global_rate_negative_words 734.1997
#16 avg_negative_polarity 715.4689
#17 kw_min_avg 710.9007
#18 num_hrefs 644.3131
#19 title_sentiment_polarity 505.5709
#20 n_tokens_title 480.0319
#21 num_imgs 451.3926
#22 kw_max_max 237.4172
#23 data_channel_is_entertainment 169.3873
#24 weekday_is_saturday 148.8817
#25 data_channel_is_socmed 130.1875
#26 weekday_is_sunday 114.3049
#Make predictions using the test data
#Use the predict() generic rather than calling predict.gbm() directly:
#calling S3 methods by name is discouraged and is inconsistent with the
#first GBM's prediction code; the generic dispatches to the same method
gbm_predict2 <- predict(gbm_model2, newdata = test2, type = "response", n.trees = 150)
#Create binary variables from predictions (0.5 probability cutoff)
#Fix both factor levels explicitly so a one-sided prediction vector cannot
#cause a level mismatch error inside confusionMatrix()
gbm_predict_binary2 <- factor(ifelse(gbm_predict2 > 0.5, 1, 0), levels = c(0, 1))
test2$shares_cat <- factor(test2$shares_cat, levels = c(0, 1))
#Confusion Matrix and Stats
gbm_ConfusionMatrix_stats2 <- confusionMatrix(gbm_predict_binary2, test2$shares_cat, positive = "1",
                                              mode = "everything")
print(gbm_ConfusionMatrix_stats2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1377 704
## 1 632 1252
##
## Accuracy : 0.6631
## 95% CI : (0.6481, 0.6778)
## No Information Rate : 0.5067
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.3257
##
## Mcnemar's Test P-Value : 0.05208
##
## Sensitivity : 0.6401
## Specificity : 0.6854
## Pos Pred Value : 0.6645
## Neg Pred Value : 0.6617
## Precision : 0.6645
## Recall : 0.6401
## F1 : 0.6521
## Prevalence : 0.4933
## Detection Rate : 0.3158
## Detection Prevalence : 0.4752
## Balanced Accuracy : 0.6627
##
## 'Positive' Class : 1
##
#Confusion Matrix and Statistics
# Reference
#Prediction 0 1
# 0 1377 704
# 1 632 1252
#Accuracy : 0.6631
#95% CI : (0.6481, 0.6778)
#No Information Rate : 0.5067
#P-Value [Acc > NIR] : < 2e-16
#Kappa : 0.3257
#Mcnemar's Test P-Value : 0.05208
# Sensitivity : 0.6401
# Specificity : 0.6854
# Pos Pred Value : 0.6645
# Neg Pred Value : 0.6617
# Precision : 0.6645
# Recall : 0.6401
# F1 : 0.6521
# Prevalence : 0.4933
# Detection Rate : 0.3158
# Detection Prevalence : 0.4752
# Balanced Accuracy : 0.6627
#AUC Score
#roc() from pROC builds the ROC curve and reports the area under the curve
gbmauc2 <- roc(response = test2$shares_cat, predictor = gbm_predict2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(gbmauc2)
##
## Call:
## roc.default(response = test2$shares_cat, predictor = gbm_predict2)
##
## Data: gbm_predict2 in 2009 controls (test2$shares_cat 0) < 1956 cases (test2$shares_cat 1).
## Area under the curve: 0.7203
#Area under the curve: 0.7203
#Plot AUC
plot(x = gbmauc2)
#Plot ROC curve using ROCR: build a prediction object, then compute the
#true-positive rate against the false-positive rate
gbmroc_pred2 <- prediction(predictions = gbm_predict2, labels = test2$shares_cat)
gbm_performance2 <- performance(gbmroc_pred2, measure = "tpr", x.measure = "fpr")
plot(gbm_performance2, colorize = TRUE)
#The accuracy of all the models tested may be compared to find the best model overall to solve the problem of
#predicting Online News Popularity based on pre-publication features.
#Overall, Random Forest performs the best with Accuracy = 66.53%, Specificity = 69.39%, Precision = 66.92%, and
#F1 score = 65.22%.
#Gradient Boosting Machine closely follows Random Forest and has a close F1 score to Random Forest with F1 = 65.21%,
#and the best Area Under the Curve = 72.03%.
#Finally, the Decision Tree model has the best Sensitivity/Recall = 65.29%.
#Remember the accuracy found for the models using 10-fold cross-validation:
#Logistic Regression #Confusion Matrix and Statistics
#Prediction 0 1 # 0 1334 742 # 1 675 1214
#Accuracy : 0.6426
#95% CI : (0.6275, 0.6576) #No Information Rate : 0.5067
#P-Value [Acc > NIR] : < 2e-16
#Kappa : 0.2848
#Mcnemar’s Test P-Value : 0.07955
#Area under the curve: 0.6883
#K-Nearest Neighbours #Confusion Matrix and Statistics
#Prediction 0 1 # 0 1335 813 # 1 674 1143
#Accuracy : 0.625
#95% CI : (0.6097, 0.6401) #No Information Rate : 0.5067
#P-Value [Acc > NIR] : < 2.2e-16
#Kappa : 0.2491
#Mcnemar’s Test P-Value : 0.0003453
#Area under the curve: 0.6244
#Decision Trees #Confusion Matrix and Statistics
#Prediction 0 1 # 0 1212 679 # 1 797 1277
#Accuracy : 0.6277
#95% CI : (0.6125, 0.6428) #No Information Rate : 0.5067
#P-Value [Acc > NIR] : < 2.2e-16
#Kappa : 0.2559
#Mcnemar’s Test P-Value : 0.002324
#Area under the curve: 0.6281
#Random Forest #Confusion Matrix and Statistics
#Prediction 0 1 # 0 1394 712 # 1 615 1244
#Accuracy : 0.6653
#95% CI : (0.6504, 0.68) #No Information Rate : 0.5067
#P-Value [Acc > NIR] : < 2.2e-16
#Kappa : 0.3301
#Mcnemar’s Test P-Value : 0.008405
#Area under the curve: 0.6649
#Gradient Boosting Machine #Confusion Matrix and Statistics
#Prediction 0 1 # 0 1377 704 # 1 632 1252
#Accuracy : 0.6631
#95% CI : (0.6481, 0.6778) #No Information Rate : 0.5067
#P-Value [Acc > NIR] : < 2e-16
#Kappa : 0.3257
#Mcnemar’s Test P-Value : 0.05208
#Area under the curve: 0.7203